From 5b7b85e499b5c6c52a35d5631cff3619e131805b Mon Sep 17 00:00:00 2001 From: Maamoun TK Date: Thu, 14 Dec 2023 11:42:57 +0200 Subject: [PATCH 1/6] Change order of SHA3 Scalar/Simd256 parameters --- benchmarks/sha3.cc | 188 +-- include/Hacl_Hash_SHA3_Scalar.h | 20 +- include/Hacl_Hash_SHA3_Simd256.h | 64 +- include/msvc/Hacl_Hash_SHA3_Scalar.h | 20 +- include/msvc/Hacl_Hash_SHA3_Simd256.h | 64 +- src/Hacl_Hash_SHA3_Scalar.c | 20 +- src/Hacl_Hash_SHA3_Simd256.c | 1654 +++++++++++++------------ tests/sha3.cc | 104 +- 8 files changed, 1124 insertions(+), 1010 deletions(-) diff --git a/benchmarks/sha3.cc b/benchmarks/sha3.cc index 9caf2e54..1deb82c8 100644 --- a/benchmarks/sha3.cc +++ b/benchmarks/sha3.cc @@ -65,7 +65,7 @@ Hacl_Sha3_224_Scalar(benchmark::State& state) { for (auto _ : state) { Hacl_Hash_SHA3_Scalar_sha3_224( - input.size(), (uint8_t*)input.data(), digest224_0.data()); + digest224_0.data(), (uint8_t*)input.data(), input.size()); } if (digest224_0 != expected_digest_sha3_224) { state.SkipWithError("Incorrect digest."); @@ -85,15 +85,15 @@ Hacl_Sha3_224_Simd256(benchmark::State& state) } for (auto _ : state) { - Hacl_Hash_SHA3_Simd256_sha3_224(input.size(), + Hacl_Hash_SHA3_Simd256_sha3_224(digest224_0.data(), + digest224_1.data(), + digest224_2.data(), + digest224_3.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), - digest224_0.data(), - digest224_1.data(), - digest224_2.data(), - digest224_3.data()); + input.size()); } if (digest224_0 != expected_digest_sha3_224 || digest224_1 != expected_digest_sha3_224 || @@ -132,12 +132,39 @@ Hacl_Sha3_256(benchmark::State& state) BENCHMARK(Hacl_Sha3_256)->Setup(DoSetup); +#include "sha3.h" + +static void +Digestif_sha3_256(benchmark::State& state) +{ + bytes digest(32, 0); + + for (auto _ : state) { + + sha3_ctx ctx; + digestif_sha3_init(&ctx, 256); + + for (auto chunk : chunk(input, chunk_len)) { + digestif_sha3_update(&ctx, chunk.data(), chunk.size()); + } + + digestif_sha3_finalize(&ctx, digest.data(), 0x06); + } + + if (digest != expected_digest_sha3_256) { + state.SkipWithError("Incorrect digest."); + return; + } +} + +BENCHMARK(Digestif_sha3_256)->Setup(DoSetup); + static void Hacl_Sha3_256_Scalar(benchmark::State& state) { for (auto _ : state) { Hacl_Hash_SHA3_Scalar_sha3_256( - input.size(), (uint8_t*)input.data(), digest256_0.data()); + digest256_0.data(), (uint8_t*)input.data(), input.size()); } if (digest256_0 != expected_digest_sha3_256) { state.SkipWithError("Incorrect digest."); @@ -157,15 +184,15 @@ Hacl_Sha3_256_Simd256(benchmark::State& state) } for (auto _ : state) { - Hacl_Hash_SHA3_Simd256_sha3_256(input.size(), + Hacl_Hash_SHA3_Simd256_sha3_256(digest256_0.data(), + digest256_1.data(), + digest256_2.data(), + digest256_3.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), - digest256_0.data(), - digest256_1.data(), - digest256_2.data(), - digest256_3.data()); + input.size()); } if (digest256_0 != expected_digest_sha3_256 || digest256_1 != expected_digest_sha3_256 || @@ -179,33 +206,6 @@ Hacl_Sha3_256_Simd256(benchmark::State& state) BENCHMARK(Hacl_Sha3_256_Simd256)->Setup(DoSetup); #endif -#include "sha3.h" - -static void -Digestif_sha3_256(benchmark::State& state) -{ - bytes digest(32, 0); - - for (auto _ : state) { - - sha3_ctx ctx; - digestif_sha3_init(&ctx, 256); - - for (auto chunk : chunk(input, chunk_len)) { - digestif_sha3_update(&ctx, chunk.data(), chunk.size()); - } - - digestif_sha3_finalize(&ctx, 
digest.data(), 0x06); - } - - if (digest != expected_digest_sha3_256) { - state.SkipWithError("Incorrect digest."); - return; - } -} - -BENCHMARK(Digestif_sha3_256)->Setup(DoSetup); - #ifndef NO_OPENSSL BENCHMARK_CAPTURE(OpenSSL_hash_oneshot, sha3_256, @@ -236,7 +236,7 @@ Hacl_Sha3_384_Scalar(benchmark::State& state) { for (auto _ : state) { Hacl_Hash_SHA3_Scalar_sha3_384( - input.size(), (uint8_t*)input.data(), digest384_0.data()); + digest384_0.data(), (uint8_t*)input.data(), input.size()); } if (digest384_0 != expected_digest_sha3_384) { state.SkipWithError("Incorrect digest."); @@ -256,15 +256,15 @@ Hacl_Sha3_384_Simd256(benchmark::State& state) } for (auto _ : state) { - Hacl_Hash_SHA3_Simd256_sha3_384(input.size(), + Hacl_Hash_SHA3_Simd256_sha3_384(digest384_0.data(), + digest384_1.data(), + digest384_2.data(), + digest384_3.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), - digest384_0.data(), - digest384_1.data(), - digest384_2.data(), - digest384_3.data()); + input.size()); } if (digest384_0 != expected_digest_sha3_384 || digest384_1 != expected_digest_sha3_384 || @@ -303,12 +303,37 @@ Hacl_Sha3_512(benchmark::State& state) BENCHMARK(Hacl_Sha3_512)->Setup(DoSetup); +static void +Digestif_sha3_512(benchmark::State& state) +{ + bytes digest(64, 0); + + for (auto _ : state) { + + sha3_ctx ctx; + digestif_sha3_init(&ctx, 512); + + for (auto chunk : chunk(input, chunk_len)) { + digestif_sha3_update(&ctx, chunk.data(), chunk.size()); + } + + digestif_sha3_finalize(&ctx, digest.data(), 0x06); + } + + if (digest != expected_digest_sha3_512) { + state.SkipWithError("Incorrect digest."); + return; + } +} + +BENCHMARK(Digestif_sha3_512)->Setup(DoSetup); + static void Hacl_Sha3_512_Scalar(benchmark::State& state) { for (auto _ : state) { Hacl_Hash_SHA3_Scalar_sha3_512( - input.size(), (uint8_t*)input.data(), digest512_0.data()); + digest512_0.data(), (uint8_t*)input.data(), input.size()); } if (digest512_0 != expected_digest_sha3_512) { state.SkipWithError("Incorrect digest."); @@ -328,15 +353,15 @@ Hacl_Sha3_512_Simd256(benchmark::State& state) } for (auto _ : state) { - Hacl_Hash_SHA3_Simd256_sha3_512(input.size(), + Hacl_Hash_SHA3_Simd256_sha3_512(digest512_0.data(), + digest512_1.data(), + digest512_2.data(), + digest512_3.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), - digest512_0.data(), - digest512_1.data(), - digest512_2.data(), - digest512_3.data()); + input.size()); } if (digest512_0 != expected_digest_sha3_512 || digest512_1 != expected_digest_sha3_512 || @@ -350,31 +375,6 @@ Hacl_Sha3_512_Simd256(benchmark::State& state) BENCHMARK(Hacl_Sha3_512_Simd256)->Setup(DoSetup); #endif -static void -Digestif_sha3_512(benchmark::State& state) -{ - bytes digest(64, 0); - - for (auto _ : state) { - - sha3_ctx ctx; - digestif_sha3_init(&ctx, 512); - - for (auto chunk : chunk(input, chunk_len)) { - digestif_sha3_update(&ctx, chunk.data(), chunk.size()); - } - - digestif_sha3_finalize(&ctx, digest.data(), 0x06); - } - - if (digest != expected_digest_sha3_512) { - state.SkipWithError("Incorrect digest."); - return; - } -} - -BENCHMARK(Digestif_sha3_512)->Setup(DoSetup); - #ifndef NO_OPENSSL BENCHMARK_CAPTURE(OpenSSL_hash_oneshot, sha3_512, @@ -469,10 +469,10 @@ static void Hacl_Sha3_shake128_Scalar(benchmark::State& state) { for (auto _ : state) { - Hacl_Hash_SHA3_Scalar_shake128(input.size(), - (uint8_t*)input.data(), + Hacl_Hash_SHA3_Scalar_shake128(digest_shake_0.data(), 
digest_shake_0.size(), - digest_shake_0.data()); + (uint8_t*)input.data(), + input.size()); } } @@ -488,16 +488,16 @@ Hacl_Sha3_shake128_Simd256(benchmark::State& state) } for (auto _ : state) { - Hacl_Hash_SHA3_Simd256_shake128(input.size(), + Hacl_Hash_SHA3_Simd256_shake128(digest_shake_0.data(), + digest_shake_1.data(), + digest_shake_2.data(), + digest_shake_3.data(), + digest_shake_0.size(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), - digest_shake_0.size(), - digest_shake_0.data(), - digest_shake_1.data(), - digest_shake_2.data(), - digest_shake_3.data()); + input.size()); } } @@ -521,10 +521,10 @@ static void Hacl_Sha3_shake256_Scalar(benchmark::State& state) { for (auto _ : state) { - Hacl_Hash_SHA3_Scalar_shake256(input.size(), - (uint8_t*)input.data(), + Hacl_Hash_SHA3_Scalar_shake256(digest_shake_0.data(), digest_shake_0.size(), - digest_shake_0.data()); + (uint8_t*)input.data(), + input.size()); } } @@ -540,16 +540,16 @@ Hacl_Sha3_shake256_Simd256(benchmark::State& state) } for (auto _ : state) { - Hacl_Hash_SHA3_Simd256_shake256(input.size(), + Hacl_Hash_SHA3_Simd256_shake256(digest_shake_0.data(), + digest_shake_1.data(), + digest_shake_2.data(), + digest_shake_3.data(), + digest_shake_0.size(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), (uint8_t*)input.data(), - digest_shake_0.size(), - digest_shake_0.data(), - digest_shake_1.data(), - digest_shake_2.data(), - digest_shake_3.data()); + input.size()); } } diff --git a/include/Hacl_Hash_SHA3_Scalar.h b/include/Hacl_Hash_SHA3_Scalar.h index e49f1967..2063da71 100644 --- a/include/Hacl_Hash_SHA3_Scalar.h +++ b/include/Hacl_Hash_SHA3_Scalar.h @@ -37,27 +37,27 @@ extern "C" { void Hacl_Hash_SHA3_Scalar_shake128( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Scalar_shake256( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ); -void Hacl_Hash_SHA3_Scalar_sha3_224(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_224(uint8_t *output, uint8_t *input, uint32_t inputByteLen); -void Hacl_Hash_SHA3_Scalar_sha3_256(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_256(uint8_t *output, uint8_t *input, uint32_t inputByteLen); -void Hacl_Hash_SHA3_Scalar_sha3_384(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t inputByteLen); -void Hacl_Hash_SHA3_Scalar_sha3_512(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen); #if defined(__cplusplus) } diff --git a/include/Hacl_Hash_SHA3_Simd256.h b/include/Hacl_Hash_SHA3_Simd256.h index 3dd3772d..22efc736 100644 --- a/include/Hacl_Hash_SHA3_Simd256.h +++ b/include/Hacl_Hash_SHA3_Simd256.h @@ -58,82 +58,82 @@ K____uint8_t___uint8_t____K____uint8_t___uint8_t_; void Hacl_Hash_SHA3_Simd256_shake128( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void 
Hacl_Hash_SHA3_Simd256_shake256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_224( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_384( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_512( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); #if defined(__cplusplus) diff --git a/include/msvc/Hacl_Hash_SHA3_Scalar.h b/include/msvc/Hacl_Hash_SHA3_Scalar.h index e49f1967..2063da71 100644 --- a/include/msvc/Hacl_Hash_SHA3_Scalar.h +++ b/include/msvc/Hacl_Hash_SHA3_Scalar.h @@ -37,27 +37,27 @@ extern "C" { void Hacl_Hash_SHA3_Scalar_shake128( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Scalar_shake256( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ); -void Hacl_Hash_SHA3_Scalar_sha3_224(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_224(uint8_t *output, uint8_t *input, uint32_t inputByteLen); -void Hacl_Hash_SHA3_Scalar_sha3_256(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_256(uint8_t *output, uint8_t *input, uint32_t inputByteLen); -void Hacl_Hash_SHA3_Scalar_sha3_384(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t inputByteLen); -void Hacl_Hash_SHA3_Scalar_sha3_512(uint32_t inputByteLen, uint8_t *input, uint8_t *output); +void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen); #if defined(__cplusplus) } diff --git a/include/msvc/Hacl_Hash_SHA3_Simd256.h b/include/msvc/Hacl_Hash_SHA3_Simd256.h index 3dd3772d..22efc736 100644 --- a/include/msvc/Hacl_Hash_SHA3_Simd256.h +++ b/include/msvc/Hacl_Hash_SHA3_Simd256.h @@ -58,82 +58,82 @@ K____uint8_t___uint8_t____K____uint8_t___uint8_t_; void Hacl_Hash_SHA3_Simd256_shake128( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, 
uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_shake256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_224( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_384( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); void Hacl_Hash_SHA3_Simd256_sha3_512( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ); #if defined(__cplusplus) diff --git a/src/Hacl_Hash_SHA3_Scalar.c b/src/Hacl_Hash_SHA3_Scalar.c index 43d57482..724426eb 100644 --- a/src/Hacl_Hash_SHA3_Scalar.c +++ b/src/Hacl_Hash_SHA3_Scalar.c @@ -55,10 +55,10 @@ Hacl_Impl_SHA3_Vec_keccak_rndc[24U] = void Hacl_Hash_SHA3_Scalar_shake128( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ) { uint32_t rateInBytes = 168U; @@ -447,10 +447,10 @@ Hacl_Hash_SHA3_Scalar_shake128( void Hacl_Hash_SHA3_Scalar_shake256( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ) { uint32_t rateInBytes = 136U; @@ -837,7 +837,7 @@ Hacl_Hash_SHA3_Scalar_shake256( memcpy(output + outputByteLen - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_224(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_224(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 144U; uint64_t s[25U] = { 0U }; @@ -1223,7 +1223,7 @@ void Hacl_Hash_SHA3_Scalar_sha3_224(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 28U - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_256(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_256(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 136U; uint64_t s[25U] = { 0U }; @@ -1609,7 +1609,7 @@ void Hacl_Hash_SHA3_Scalar_sha3_256(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 32U - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_384(uint32_t inputByteLen, uint8_t *input, 
uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 104U; uint64_t s[25U] = { 0U }; @@ -1995,7 +1995,7 @@ void Hacl_Hash_SHA3_Scalar_sha3_384(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 48U - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_512(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 72U; uint64_t s[25U] = { 0U }; diff --git a/src/Hacl_Hash_SHA3_Simd256.c b/src/Hacl_Hash_SHA3_Simd256.c index b9bfcee5..fbe195f5 100644 --- a/src/Hacl_Hash_SHA3_Simd256.c +++ b/src/Hacl_Hash_SHA3_Simd256.c @@ -30,16 +30,16 @@ void Hacl_Hash_SHA3_Simd256_shake128( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -438,63 +438,63 @@ Hacl_Hash_SHA3_Simd256_shake128( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x1FU; + b12[rem] = 0x1FU; + b22[rem] = 0x1FU; + b32[rem] = 0x1FU; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x1FU; - b13[rem] = 0x1FU; - b23[rem] = 0x1FU; - b33[rem] = 0x1FU; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -723,57 +723,57 @@ Hacl_Hash_SHA3_Simd256_shake128( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b36 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -1332,25 +1332,34 @@ Hacl_Hash_SHA3_Simd256_shake128( } for (uint32_t i = 0U; i < rateInBytes / 32U; i++) { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 
32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + } + if (rateInBytes % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 32U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 64U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 96U, + rateInBytes % 32U * sizeof (uint8_t)); } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -1691,30 +1700,40 @@ Hacl_Hash_SHA3_Simd256_shake128( memcpy(b2 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); memcpy(b3 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + if (remOut % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b1 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 32U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b2 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 64U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b3 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 96U, + remOut % 32U * sizeof (uint8_t)); + return; + } } void Hacl_Hash_SHA3_Simd256_shake256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -2113,63 +2132,63 @@ 
Hacl_Hash_SHA3_Simd256_shake256( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x1FU; + b12[rem] = 0x1FU; + b22[rem] = 0x1FU; + b32[rem] = 0x1FU; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x1FU; - b13[rem] = 0x1FU; - b23[rem] = 0x1FU; - b33[rem] = 0x1FU; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -2398,57 +2417,57 @@ Hacl_Hash_SHA3_Simd256_shake256( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) 
Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b36 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -3007,25 +3026,34 @@ Hacl_Hash_SHA3_Simd256_shake256( } for (uint32_t i = 0U; i < rateInBytes / 32U; i++) { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + } + if (rateInBytes % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 32U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 64U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 96U, + rateInBytes % 32U * sizeof (uint8_t)); } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 
* sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -3366,29 +3394,39 @@ Hacl_Hash_SHA3_Simd256_shake256( memcpy(b2 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); memcpy(b3 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + if (remOut % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b1 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 32U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b2 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 64U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b3 + outputByteLen - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 96U, + remOut % 32U * sizeof (uint8_t)); + return; + } } void Hacl_Hash_SHA3_Simd256_sha3_224( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -3787,63 +3825,63 @@ Hacl_Hash_SHA3_Simd256_sha3_224( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) 
Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + 
ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -4072,57 +4110,57 @@ Hacl_Hash_SHA3_Simd256_sha3_224( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b36 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); Lib_IntVector_Intrinsics_vec256 
v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -4681,25 +4719,34 @@ Hacl_Hash_SHA3_Simd256_sha3_224( } for (uint32_t i = 0U; i < rateInBytes / 32U; i++) { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + } + if (rateInBytes % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 32U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 64U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 96U, + rateInBytes % 32U * sizeof (uint8_t)); } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -5040,29 +5087,39 @@ Hacl_Hash_SHA3_Simd256_sha3_224( memcpy(b2 + 28U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); memcpy(b3 + 28U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 28U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 28U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 28U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 28U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + if (remOut % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 28U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U, + remOut % 32U * 
sizeof (uint8_t)); + memcpy(b1 + 28U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 32U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b2 + 28U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 64U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b3 + 28U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 96U, + remOut % 32U * sizeof (uint8_t)); + return; + } } void Hacl_Hash_SHA3_Simd256_sha3_256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -5461,63 +5518,63 @@ Hacl_Hash_SHA3_Simd256_sha3_256( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -5746,57 +5803,57 @@ 
Hacl_Hash_SHA3_Simd256_sha3_256( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b36 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); 
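+  /* Descriptive note: these loads gather each lane's padded final block
+     32 bytes at a time, so ws34[4U * k + j] receives bytes 32*k .. 32*k + 31
+     of lane j, where lanes 0..3 are b06, b16, b26 and b36. */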
+ ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -6355,25 +6412,34 @@ Hacl_Hash_SHA3_Simd256_sha3_256( } for (uint32_t i = 0U; i < rateInBytes / 32U; i++) { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + } + if (rateInBytes % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U, + 
rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 32U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 64U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 96U, + rateInBytes % 32U * sizeof (uint8_t)); } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -6714,29 +6780,39 @@ Hacl_Hash_SHA3_Simd256_sha3_256( memcpy(b2 + 32U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); memcpy(b3 + 32U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 32U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 32U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 32U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 32U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + if (remOut % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 32U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b1 + 32U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 32U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b2 + 32U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 64U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b3 + 32U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 96U, + remOut % 32U * sizeof (uint8_t)); + return; + } } void Hacl_Hash_SHA3_Simd256_sha3_384( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -7135,63 +7211,63 @@ Hacl_Hash_SHA3_Simd256_sha3_384( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - 
memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -7420,57 +7496,57 @@ Hacl_Hash_SHA3_Simd256_sha3_384( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b36 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + 
ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -8029,25 +8105,34 @@ Hacl_Hash_SHA3_Simd256_sha3_384( } for (uint32_t i = 0U; i < rateInBytes / 32U; i++) { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + } + if (rateInBytes % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 32U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 64U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 96U, + rateInBytes % 32U * sizeof (uint8_t)); } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -8388,29 +8473,39 @@ Hacl_Hash_SHA3_Simd256_sha3_384( memcpy(b2 + 48U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); memcpy(b3 + 48U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); } - uint32_t rem0 = remOut % 32U; - uint32_t j 
= remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 48U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 48U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 48U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 48U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + if (remOut % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 48U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b1 + 48U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 32U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b2 + 48U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 64U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b3 + 48U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 96U, + remOut % 32U * sizeof (uint8_t)); + return; + } } void Hacl_Hash_SHA3_Simd256_sha3_512( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -8809,63 +8904,63 @@ Hacl_Hash_SHA3_Simd256_sha3_512( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -9094,57 +9189,57 @@ Hacl_Hash_SHA3_Simd256_sha3_512( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b36 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -9703,25 +9798,34 @@ Hacl_Hash_SHA3_Simd256_sha3_512( } for (uint32_t i = 0U; i < rateInBytes / 32U; i++) { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof 
(uint8_t)); + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); + } + if (rateInBytes % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 32U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 64U, + rateInBytes % 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, + hbuf + rateInBytes / 32U * 128U + 96U, + rateInBytes % 32U * sizeof (uint8_t)); } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -10062,15 +10166,25 @@ Hacl_Hash_SHA3_Simd256_sha3_512( memcpy(b2 + 64U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); memcpy(b3 + 64U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 64U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 64U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 64U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 64U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + if (remOut % 32U > 0U) + { + uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 64U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b1 + 64U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 32U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b2 + 64U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 64U, + remOut % 32U * sizeof (uint8_t)); + memcpy(b3 + 64U - remOut + remOut / 32U * 32U, + hbuf + remOut / 32U * 128U + 96U, + remOut % 32U * sizeof (uint8_t)); + return; + } } diff --git a/tests/sha3.cc b/tests/sha3.cc index 858c1542..cc88a2e8 100644 --- a/tests/sha3.cc +++ b/tests/sha3.cc @@ -178,7 +178,7 @@ TEST(ApiSuite, ApiTest) uint8_t digest[HACL_HASH_SHA3_256_DIGEST_LENGTH]; - Hacl_Hash_SHA3_Scalar_sha3_256(message_size, (uint8_t*)message, digest); + 
Hacl_Hash_SHA3_Scalar_sha3_256(digest, (uint8_t*)message, message_size); // ANCHOR_END(example scalar_sha3_256) bytes expected_digest = from_hex( @@ -204,7 +204,7 @@ TEST(ApiSuite, ApiTest) uint8_t digest[42]; Hacl_Hash_SHA3_Scalar_shake128( - message_size, (uint8_t*)message, digest_size, digest); + digest, digest_size, (uint8_t*)message, message_size); // ANCHOR_END(example scalar_shake128) bytes expected_digest = @@ -232,15 +232,15 @@ TEST(ApiSuite, ApiTest) uint8_t digest2[HACL_HASH_SHA3_256_DIGEST_LENGTH]; uint8_t digest3[HACL_HASH_SHA3_256_DIGEST_LENGTH]; - Hacl_Hash_SHA3_Simd256_sha3_256(message_size, + Hacl_Hash_SHA3_Simd256_sha3_256(digest0, + digest1, + digest2, + digest3, (uint8_t*)message, (uint8_t*)message, (uint8_t*)message, (uint8_t*)message, - digest0, - digest1, - digest2, - digest3); + message_size); // ANCHOR_END(example vec256_sha3_256) bytes expected_digest = from_hex( @@ -283,16 +283,16 @@ TEST(ApiSuite, ApiTest) uint8_t digest2[42]; uint8_t digest3[42]; - Hacl_Hash_SHA3_Simd256_shake128(message_size, + Hacl_Hash_SHA3_Simd256_shake128(digest0, + digest1, + digest2, + digest3, + digest_size, (uint8_t*)message0, (uint8_t*)message1, (uint8_t*)message2, (uint8_t*)message3, - digest_size, - digest0, - digest1, - digest2, - digest3); + message_size); // ANCHOR_END(example vec256_shake128) bytes expected_digest0 = from_hex( @@ -356,16 +356,16 @@ TEST_P(Sha3KAT, TryKAT) bytes digest(test_case.md.size(), 0); if (test_case.md.size() == 224 / 8) { Hacl_Hash_SHA3_Scalar_sha3_224( - test_case.msg.size(), test_case.msg.data(), digest.data()); + digest.data(), test_case.msg.data(), test_case.msg.size()); } else if (test_case.md.size() == 256 / 8) { Hacl_Hash_SHA3_Scalar_sha3_256( - test_case.msg.size(), test_case.msg.data(), digest.data()); + digest.data(), test_case.msg.data(), test_case.msg.size()); } else if (test_case.md.size() == 384 / 8) { Hacl_Hash_SHA3_Scalar_sha3_384( - test_case.msg.size(), test_case.msg.data(), digest.data()); + digest.data(), test_case.msg.data(), test_case.msg.size()); } else if (test_case.md.size() == 512 / 8) { Hacl_Hash_SHA3_Scalar_sha3_512( - test_case.msg.size(), test_case.msg.data(), digest.data()); + digest.data(), test_case.msg.data(), test_case.msg.size()); } EXPECT_EQ(test_case.md, digest) << bytes_to_hex(test_case.md) << std::endl @@ -380,45 +380,45 @@ TEST_P(Sha3KAT, TryKAT) bytes digest2(test_case.md.size(), 0); bytes digest3(test_case.md.size(), 0); if (test_case.md.size() == 224 / 8) { - Hacl_Hash_SHA3_Simd256_sha3_224(test_case.msg.size(), + Hacl_Hash_SHA3_Simd256_sha3_224(digest0.data(), + digest1.data(), + digest2.data(), + digest3.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), - digest0.data(), + test_case.msg.size()); + } else if (test_case.md.size() == 256 / 8) { + Hacl_Hash_SHA3_Simd256_sha3_256(digest0.data(), digest1.data(), digest2.data(), - digest3.data()); - } else if (test_case.md.size() == 256 / 8) { - Hacl_Hash_SHA3_Simd256_sha3_256(test_case.msg.size(), + digest3.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), - digest0.data(), + test_case.msg.size()); + } else if (test_case.md.size() == 384 / 8) { + Hacl_Hash_SHA3_Simd256_sha3_384(digest0.data(), digest1.data(), digest2.data(), - digest3.data()); - } else if (test_case.md.size() == 384 / 8) { - Hacl_Hash_SHA3_Simd256_sha3_384(test_case.msg.size(), + digest3.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), - digest0.data(), + 
test_case.msg.size()); + } else if (test_case.md.size() == 512 / 8) { + Hacl_Hash_SHA3_Simd256_sha3_512(digest0.data(), digest1.data(), digest2.data(), - digest3.data()); - } else if (test_case.md.size() == 512 / 8) { - Hacl_Hash_SHA3_Simd256_sha3_512(test_case.msg.size(), + digest3.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), - digest0.data(), - digest1.data(), - digest2.data(), - digest3.data()); + test_case.msg.size()); } EXPECT_EQ(test_case.md, digest0) << bytes_to_hex(test_case.md) << std::endl @@ -468,20 +468,20 @@ TEST_P(ShakeKAT, TryKAT) if (test_case.md.size() == 128 / 8) { bytes digest(test_case.md.size(), 128 / 8); - Hacl_Hash_SHA3_Scalar_shake128(test_case.msg.size(), - test_case.msg.data(), + Hacl_Hash_SHA3_Scalar_shake128(digest.data(), digest.size(), - digest.data()); + test_case.msg.data(), + test_case.msg.size()); EXPECT_EQ(test_case.md, digest) << bytes_to_hex(test_case.md) << std::endl << bytes_to_hex(digest) << std::endl; } else if (test_case.md.size() == 256 / 8) { bytes digest(test_case.md.size(), 256 / 8); - Hacl_Hash_SHA3_Scalar_shake256(test_case.msg.size(), - test_case.msg.data(), + Hacl_Hash_SHA3_Scalar_shake256(digest.data(), digest.size(), - digest.data()); + test_case.msg.data(), + test_case.msg.size()); EXPECT_EQ(test_case.md, digest) << bytes_to_hex(test_case.md) << std::endl << bytes_to_hex(digest) << std::endl; @@ -497,16 +497,16 @@ TEST_P(ShakeKAT, TryKAT) bytes digest2(test_case.md.size(), 128 / 8); bytes digest3(test_case.md.size(), 128 / 8); - Hacl_Hash_SHA3_Simd256_shake128(test_case.msg.size(), + Hacl_Hash_SHA3_Simd256_shake128(digest0.data(), + digest1.data(), + digest2.data(), + digest3.data(), + digest0.size(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), - digest0.size(), - digest0.data(), - digest1.data(), - digest2.data(), - digest3.data()); + test_case.msg.size()); EXPECT_EQ(test_case.md, digest0) << bytes_to_hex(test_case.md) << std::endl @@ -526,16 +526,16 @@ TEST_P(ShakeKAT, TryKAT) bytes digest2(test_case.md.size(), 256 / 8); bytes digest3(test_case.md.size(), 256 / 8); - Hacl_Hash_SHA3_Simd256_shake256(test_case.msg.size(), + Hacl_Hash_SHA3_Simd256_shake256(digest0.data(), + digest1.data(), + digest2.data(), + digest3.data(), + digest0.size(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), test_case.msg.data(), - digest0.size(), - digest0.data(), - digest1.data(), - digest2.data(), - digest3.data()); + test_case.msg.size()); EXPECT_EQ(test_case.md, digest0) << bytes_to_hex(test_case.md) << std::endl From 34a673651b5ca720793c6e86d0ca43f75616b5ba Mon Sep 17 00:00:00 2001 From: Maamoun TK Date: Thu, 14 Dec 2023 18:41:34 +0200 Subject: [PATCH 2/6] Expose shake128_absorb/squeeze functions --- include/Hacl_Hash_SHA3_Scalar.h | 14 + include/Hacl_Hash_SHA3_Simd256.h | 24 + include/msvc/Hacl_Hash_SHA3_Scalar.h | 14 + include/msvc/Hacl_Hash_SHA3_Simd256.h | 24 + src/Hacl_Hash_SHA3_Scalar.c | 402 +++++++ src/Hacl_Hash_SHA3_Simd256.c | 1434 +++++++++++++++++++++++++ 6 files changed, 1912 insertions(+) diff --git a/include/Hacl_Hash_SHA3_Scalar.h b/include/Hacl_Hash_SHA3_Scalar.h index 2063da71..4b893cd8 100644 --- a/include/Hacl_Hash_SHA3_Scalar.h +++ b/include/Hacl_Hash_SHA3_Scalar.h @@ -59,6 +59,20 @@ void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t in void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen); +uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void); + +void 
Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s); + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen); + +void +Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( + uint64_t *state, + uint8_t *output, + uint32_t outputByteLen +); + #if defined(__cplusplus) } #endif diff --git a/include/Hacl_Hash_SHA3_Simd256.h b/include/Hacl_Hash_SHA3_Simd256.h index 22efc736..534a8899 100644 --- a/include/Hacl_Hash_SHA3_Simd256.h +++ b/include/Hacl_Hash_SHA3_Simd256.h @@ -136,6 +136,30 @@ Hacl_Hash_SHA3_Simd256_sha3_512( uint32_t inputByteLen ); +uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void); + +void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s); + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +); + +void +Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen +); + #if defined(__cplusplus) } #endif diff --git a/include/msvc/Hacl_Hash_SHA3_Scalar.h b/include/msvc/Hacl_Hash_SHA3_Scalar.h index 2063da71..4b893cd8 100644 --- a/include/msvc/Hacl_Hash_SHA3_Scalar.h +++ b/include/msvc/Hacl_Hash_SHA3_Scalar.h @@ -59,6 +59,20 @@ void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t in void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen); +uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void); + +void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s); + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen); + +void +Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( + uint64_t *state, + uint8_t *output, + uint32_t outputByteLen +); + #if defined(__cplusplus) } #endif diff --git a/include/msvc/Hacl_Hash_SHA3_Simd256.h b/include/msvc/Hacl_Hash_SHA3_Simd256.h index 22efc736..534a8899 100644 --- a/include/msvc/Hacl_Hash_SHA3_Simd256.h +++ b/include/msvc/Hacl_Hash_SHA3_Simd256.h @@ -136,6 +136,30 @@ Hacl_Hash_SHA3_Simd256_sha3_512( uint32_t inputByteLen ); +uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void); + +void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s); + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +); + +void +Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen +); + #if defined(__cplusplus) } #endif diff --git a/src/Hacl_Hash_SHA3_Scalar.c b/src/Hacl_Hash_SHA3_Scalar.c index 724426eb..19a1936a 100644 --- a/src/Hacl_Hash_SHA3_Scalar.c +++ b/src/Hacl_Hash_SHA3_Scalar.c @@ -2381,3 +2381,405 @@ void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t in memcpy(output + 64U - remOut, hbuf, remOut * sizeof (uint8_t)); } +uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void) +{ + uint64_t *buf = (uint64_t *)KRML_HOST_CALLOC(25U, sizeof (uint64_t)); + return buf; +} + +void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s) +{ + KRML_HOST_FREE(s); +} + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen) +{ + for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) + { + uint8_t b1[256U] = { 0U }; + uint8_t *b_ = b1; + uint8_t *b0 = input; + 
uint8_t *bl0 = b_; + memcpy(bl0, b0 + i0 * 168U, 168U * sizeof (uint8_t)); + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u = load64_le(b); + ws[0U] = u; + uint64_t u0 = load64_le(b + 8U); + ws[1U] = u0; + uint64_t u1 = load64_le(b + 16U); + ws[2U] = u1; + uint64_t u2 = load64_le(b + 24U); + ws[3U] = u2; + uint64_t u3 = load64_le(b + 32U); + ws[4U] = u3; + uint64_t u4 = load64_le(b + 40U); + ws[5U] = u4; + uint64_t u5 = load64_le(b + 48U); + ws[6U] = u5; + uint64_t u6 = load64_le(b + 56U); + ws[7U] = u6; + uint64_t u7 = load64_le(b + 64U); + ws[8U] = u7; + uint64_t u8 = load64_le(b + 72U); + ws[9U] = u8; + uint64_t u9 = load64_le(b + 80U); + ws[10U] = u9; + uint64_t u10 = load64_le(b + 88U); + ws[11U] = u10; + uint64_t u11 = load64_le(b + 96U); + ws[12U] = u11; + uint64_t u12 = load64_le(b + 104U); + ws[13U] = u12; + uint64_t u13 = load64_le(b + 112U); + ws[14U] = u13; + uint64_t u14 = load64_le(b + 120U); + ws[15U] = u14; + uint64_t u15 = load64_le(b + 128U); + ws[16U] = u15; + uint64_t u16 = load64_le(b + 136U); + ws[17U] = u16; + uint64_t u17 = load64_le(b + 144U); + ws[18U] = u17; + uint64_t u18 = load64_le(b + 152U); + ws[19U] = u18; + uint64_t u19 = load64_le(b + 160U); + ws[20U] = u19; + uint64_t u20 = load64_le(b + 168U); + ws[21U] = u20; + uint64_t u21 = load64_le(b + 176U); + ws[22U] = u21; + uint64_t u22 = load64_le(b + 184U); + ws[23U] = u22; + uint64_t u23 = load64_le(b + 192U); + ws[24U] = u23; + uint64_t u24 = load64_le(b + 200U); + ws[25U] = u24; + uint64_t u25 = load64_le(b + 208U); + ws[26U] = u25; + uint64_t u26 = load64_le(b + 216U); + ws[27U] = u26; + uint64_t u27 = load64_le(b + 224U); + ws[28U] = u27; + uint64_t u28 = load64_le(b + 232U); + ws[29U] = u28; + uint64_t u29 = load64_le(b + 240U); + ws[30U] = u29; + uint64_t u30 = load64_le(b + 248U); + ws[31U] = u30; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = + state[i + + 0U] + ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i2 + 1U) % 5U]; + uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + state[0U] = state[0U] ^ c; + } + } + uint32_t rem = inputByteLen % 168U; + uint8_t b2[256U] = { 0U }; + uint8_t *b_ = b2; + uint32_t rem1 = inputByteLen % 168U; + uint8_t *b00 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b00 + 
inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b01 = b_; + b01[rem] = 0x1FU; + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u0 = load64_le(b); + ws[0U] = u0; + uint64_t u1 = load64_le(b + 8U); + ws[1U] = u1; + uint64_t u2 = load64_le(b + 16U); + ws[2U] = u2; + uint64_t u3 = load64_le(b + 24U); + ws[3U] = u3; + uint64_t u4 = load64_le(b + 32U); + ws[4U] = u4; + uint64_t u5 = load64_le(b + 40U); + ws[5U] = u5; + uint64_t u6 = load64_le(b + 48U); + ws[6U] = u6; + uint64_t u7 = load64_le(b + 56U); + ws[7U] = u7; + uint64_t u8 = load64_le(b + 64U); + ws[8U] = u8; + uint64_t u9 = load64_le(b + 72U); + ws[9U] = u9; + uint64_t u10 = load64_le(b + 80U); + ws[10U] = u10; + uint64_t u11 = load64_le(b + 88U); + ws[11U] = u11; + uint64_t u12 = load64_le(b + 96U); + ws[12U] = u12; + uint64_t u13 = load64_le(b + 104U); + ws[13U] = u13; + uint64_t u14 = load64_le(b + 112U); + ws[14U] = u14; + uint64_t u15 = load64_le(b + 120U); + ws[15U] = u15; + uint64_t u16 = load64_le(b + 128U); + ws[16U] = u16; + uint64_t u17 = load64_le(b + 136U); + ws[17U] = u17; + uint64_t u18 = load64_le(b + 144U); + ws[18U] = u18; + uint64_t u19 = load64_le(b + 152U); + ws[19U] = u19; + uint64_t u20 = load64_le(b + 160U); + ws[20U] = u20; + uint64_t u21 = load64_le(b + 168U); + ws[21U] = u21; + uint64_t u22 = load64_le(b + 176U); + ws[22U] = u22; + uint64_t u23 = load64_le(b + 184U); + ws[23U] = u23; + uint64_t u24 = load64_le(b + 192U); + ws[24U] = u24; + uint64_t u25 = load64_le(b + 200U); + ws[25U] = u25; + uint64_t u26 = load64_le(b + 208U); + ws[26U] = u26; + uint64_t u27 = load64_le(b + 216U); + ws[27U] = u27; + uint64_t u28 = load64_le(b + 224U); + ws[28U] = u28; + uint64_t u29 = load64_le(b + 232U); + ws[29U] = u29; + uint64_t u30 = load64_le(b + 240U); + ws[30U] = u30; + uint64_t u31 = load64_le(b + 248U); + ws[31U] = u31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + uint8_t b3[256U] = { 0U }; + uint8_t *b4 = b3; + uint8_t *b0 = b4; + b0[167U] = 0x80U; + uint64_t ws0[32U] = { 0U }; + uint8_t *b1 = b4; + uint64_t u = load64_le(b1); + ws0[0U] = u; + uint64_t u32 = load64_le(b1 + 8U); + ws0[1U] = u32; + uint64_t u33 = load64_le(b1 + 16U); + ws0[2U] = u33; + uint64_t u34 = load64_le(b1 + 24U); + ws0[3U] = u34; + uint64_t u35 = load64_le(b1 + 32U); + ws0[4U] = u35; + uint64_t u36 = load64_le(b1 + 40U); + ws0[5U] = u36; + uint64_t u37 = load64_le(b1 + 48U); + ws0[6U] = u37; + uint64_t u38 = load64_le(b1 + 56U); + ws0[7U] = u38; + uint64_t u39 = load64_le(b1 + 64U); + ws0[8U] = u39; + uint64_t u40 = load64_le(b1 + 72U); + ws0[9U] = u40; + uint64_t u41 = load64_le(b1 + 80U); + ws0[10U] = u41; + uint64_t u42 = load64_le(b1 + 88U); + ws0[11U] = u42; + uint64_t u43 = load64_le(b1 + 96U); + ws0[12U] = u43; + uint64_t u44 = load64_le(b1 + 104U); + ws0[13U] = u44; + uint64_t u45 = load64_le(b1 + 112U); + ws0[14U] = u45; + uint64_t u46 = load64_le(b1 + 120U); + ws0[15U] = u46; + uint64_t u47 = load64_le(b1 + 128U); + ws0[16U] = u47; + uint64_t u48 = load64_le(b1 + 136U); + ws0[17U] = u48; + uint64_t u49 = load64_le(b1 + 144U); + ws0[18U] = u49; + uint64_t u50 = load64_le(b1 + 152U); + ws0[19U] = u50; + uint64_t u51 = load64_le(b1 + 160U); + ws0[20U] = u51; + uint64_t u52 = load64_le(b1 + 168U); + ws0[21U] = u52; + uint64_t u53 = load64_le(b1 + 176U); + ws0[22U] = u53; + uint64_t u54 = load64_le(b1 + 184U); + ws0[23U] = u54; + uint64_t u55 = load64_le(b1 + 192U); + ws0[24U] = u55; + uint64_t u56 = load64_le(b1 + 200U); + ws0[25U] = u56; + uint64_t u57 = load64_le(b1 + 208U); + 
ws0[26U] = u57; + uint64_t u58 = load64_le(b1 + 216U); + ws0[27U] = u58; + uint64_t u59 = load64_le(b1 + 224U); + ws0[28U] = u59; + uint64_t u60 = load64_le(b1 + 232U); + ws0[29U] = u60; + uint64_t u61 = load64_le(b1 + 240U); + ws0[30U] = u61; + uint64_t u62 = load64_le(b1 + 248U); + ws0[31U] = u62; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws0[i]; + } + for (uint32_t i0 = 0U; i0 < 24U; i0++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = state[i + 0U] ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i1, + 0U, + 5U, + 1U, + uint64_t uu____2 = _C[(i1 + 1U) % 5U]; + uint64_t _D = _C[(i1 + 4U) % 5U] ^ (uu____2 << 1U | uu____2 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i1 + 5U * i] = state[i1 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____3 = current; + state[_Y] = uu____3 << r | uu____3 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; + state[0U] = state[0U] ^ c; + } +} + +void +Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( + uint64_t *state, + uint8_t *output, + uint32_t outputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < outputByteLen / 168U; i0++) + { + uint8_t hbuf[256U] = { 0U }; + uint64_t ws[32U] = { 0U }; + memcpy(ws, state, 25U * sizeof (uint64_t)); + for (uint32_t i = 0U; i < 32U; i++) + { + store64_le(hbuf + i * 8U, ws[i]); + } + memcpy(output + i0 * 168U, hbuf, 168U * sizeof (uint8_t)); + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = + state[i + + 0U] + ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i2 + 1U) % 5U]; + uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = 
v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + state[0U] = state[0U] ^ c; + } + } +} + diff --git a/src/Hacl_Hash_SHA3_Simd256.c b/src/Hacl_Hash_SHA3_Simd256.c index fbe195f5..a401bd71 100644 --- a/src/Hacl_Hash_SHA3_Simd256.c +++ b/src/Hacl_Hash_SHA3_Simd256.c @@ -10188,3 +10188,1437 @@ Hacl_Hash_SHA3_Simd256_sha3_512( } } +uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void) +{ + uint64_t *buf = (uint64_t *)KRML_HOST_CALLOC(100U, sizeof (uint64_t)); + return buf; +} + +void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s) +{ + KRML_HOST_FREE(s); +} + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) + { + uint8_t b00[256U] = { 0U }; + uint8_t b10[256U] = { 0U }; + uint8_t b20[256U] = { 0U }; + uint8_t b30[256U] = { 0U }; + K____uint8_t___uint8_t____K____uint8_t___uint8_t_ + b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; + uint8_t *b01 = input0; + uint8_t *b11 = input1; + uint8_t *b21 = input2; + uint8_t *b31 = input3; + uint8_t *bl3 = b_.snd.snd.snd; + uint8_t *bl2 = b_.snd.snd.fst; + uint8_t *bl1 = b_.snd.fst; + uint8_t *bl0 = b_.fst; + memcpy(bl0, b01 + i0 * 168U, 168U * sizeof (uint8_t)); + memcpy(bl1, b11 + i0 * 168U, 168U * sizeof (uint8_t)); + memcpy(bl2, b21 + i0 * 168U, 168U * sizeof (uint8_t)); + memcpy(bl3, b31 + i0 * 168U, 168U * sizeof (uint8_t)); + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b_.snd.snd.snd; + uint8_t *b2 = b_.snd.snd.fst; + uint8_t *b1 = b_.snd.fst; + uint8_t *b0 = b_.fst; + ws[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0); + ws[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1); + ws[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2); + ws[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 32U); + ws[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 32U); + ws[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 32U); + ws[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 64U); + ws[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 64U); + ws[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 64U); + ws[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 96U); + ws[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 96U); + ws[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 96U); + ws[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 128U); + ws[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 128U); + ws[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 128U); + ws[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 160U); + ws[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 160U); + ws[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 160U); + ws[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 192U); + ws[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 192U); + ws[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 192U); + ws[27U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 224U); + ws[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 224U); + ws[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 224U); + ws[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); + Lib_IntVector_Intrinsics_vec256 v00 = ws[0U]; + Lib_IntVector_Intrinsics_vec256 v10 = ws[1U]; + Lib_IntVector_Intrinsics_vec256 v20 = ws[2U]; + Lib_IntVector_Intrinsics_vec256 v30 = ws[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v1_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v2_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v3_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v0__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v1__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v2__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 + v3__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 ws0 = v0__; + Lib_IntVector_Intrinsics_vec256 ws1 = v2__; + Lib_IntVector_Intrinsics_vec256 ws2 = v1__; + Lib_IntVector_Intrinsics_vec256 ws3 = v3__; + Lib_IntVector_Intrinsics_vec256 v01 = ws[4U]; + Lib_IntVector_Intrinsics_vec256 v11 = ws[5U]; + Lib_IntVector_Intrinsics_vec256 v21 = ws[6U]; + Lib_IntVector_Intrinsics_vec256 v31 = ws[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v1_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v2_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v3_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v0__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v1__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v2__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 + v3__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 ws4 = v0__0; + Lib_IntVector_Intrinsics_vec256 ws5 = v2__0; + Lib_IntVector_Intrinsics_vec256 ws6 = v1__0; + Lib_IntVector_Intrinsics_vec256 ws7 = v3__0; + Lib_IntVector_Intrinsics_vec256 v02 = ws[8U]; + Lib_IntVector_Intrinsics_vec256 v12 = ws[9U]; + Lib_IntVector_Intrinsics_vec256 v22 = ws[10U]; + Lib_IntVector_Intrinsics_vec256 v32 = ws[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v1_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v2_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v3_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v0__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v1__1 = 
Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v2__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 + v3__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 ws8 = v0__1; + Lib_IntVector_Intrinsics_vec256 ws9 = v2__1; + Lib_IntVector_Intrinsics_vec256 ws10 = v1__1; + Lib_IntVector_Intrinsics_vec256 ws11 = v3__1; + Lib_IntVector_Intrinsics_vec256 v03 = ws[12U]; + Lib_IntVector_Intrinsics_vec256 v13 = ws[13U]; + Lib_IntVector_Intrinsics_vec256 v23 = ws[14U]; + Lib_IntVector_Intrinsics_vec256 v33 = ws[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v1_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v2_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v3_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v0__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v1__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v2__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 + v3__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 ws12 = v0__2; + Lib_IntVector_Intrinsics_vec256 ws13 = v2__2; + Lib_IntVector_Intrinsics_vec256 ws14 = v1__2; + Lib_IntVector_Intrinsics_vec256 ws15 = v3__2; + Lib_IntVector_Intrinsics_vec256 v04 = ws[16U]; + Lib_IntVector_Intrinsics_vec256 v14 = ws[17U]; + Lib_IntVector_Intrinsics_vec256 v24 = ws[18U]; + Lib_IntVector_Intrinsics_vec256 v34 = ws[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v1_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v2_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v3_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v0__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v1__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v2__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 + v3__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 ws16 = v0__3; + Lib_IntVector_Intrinsics_vec256 ws17 = v2__3; + Lib_IntVector_Intrinsics_vec256 ws18 = v1__3; + Lib_IntVector_Intrinsics_vec256 ws19 = v3__3; + Lib_IntVector_Intrinsics_vec256 v05 = ws[20U]; + Lib_IntVector_Intrinsics_vec256 v15 = ws[21U]; + Lib_IntVector_Intrinsics_vec256 v25 = ws[22U]; + Lib_IntVector_Intrinsics_vec256 v35 = ws[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v1_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v2_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v3_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v25, v35); + 
Lib_IntVector_Intrinsics_vec256 + v0__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v1__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v2__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 + v3__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 ws20 = v0__4; + Lib_IntVector_Intrinsics_vec256 ws21 = v2__4; + Lib_IntVector_Intrinsics_vec256 ws22 = v1__4; + Lib_IntVector_Intrinsics_vec256 ws23 = v3__4; + Lib_IntVector_Intrinsics_vec256 v06 = ws[24U]; + Lib_IntVector_Intrinsics_vec256 v16 = ws[25U]; + Lib_IntVector_Intrinsics_vec256 v26 = ws[26U]; + Lib_IntVector_Intrinsics_vec256 v36 = ws[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v1_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v2_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v3_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v0__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v1__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v2__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 + v3__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 ws24 = v0__5; + Lib_IntVector_Intrinsics_vec256 ws25 = v2__5; + Lib_IntVector_Intrinsics_vec256 ws26 = v1__5; + Lib_IntVector_Intrinsics_vec256 ws27 = v3__5; + Lib_IntVector_Intrinsics_vec256 v0 = ws[28U]; + Lib_IntVector_Intrinsics_vec256 v1 = ws[29U]; + Lib_IntVector_Intrinsics_vec256 v2 = ws[30U]; + Lib_IntVector_Intrinsics_vec256 v3 = ws[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v1_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v2_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v3_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v0__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v1__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v2__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 + v3__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 ws28 = v0__6; + Lib_IntVector_Intrinsics_vec256 ws29 = v2__6; + Lib_IntVector_Intrinsics_vec256 ws30 = v1__6; + Lib_IntVector_Intrinsics_vec256 ws31 = v3__6; + ws[0U] = ws0; + ws[1U] = ws1; + ws[2U] = ws2; + ws[3U] = ws3; + ws[4U] = ws4; + ws[5U] = ws5; + ws[6U] = ws6; + ws[7U] = ws7; + ws[8U] = ws8; + ws[9U] = ws9; + ws[10U] = ws10; + ws[11U] = ws11; + ws[12U] = ws12; + ws[13U] = ws13; + ws[14U] = ws14; + ws[15U] = ws15; + ws[16U] = ws16; + ws[17U] = ws17; + ws[18U] = ws18; + ws[19U] = ws19; + ws[20U] = ws20; + ws[21U] = ws21; + ws[22U] = ws22; + ws[23U] = ws23; + ws[24U] = ws24; + ws[25U] = ws25; + ws[26U] = ws26; + ws[27U] = 
ws27; + ws[28U] = ws28; + ws[29U] = ws29; + ws[30U] = ws30; + ws[31U] = ws31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = Lib_IntVector_Intrinsics_vec256_xor(state[i], ws[i]); + } + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____0 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____1 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____2 = state[i + 10U]; + _C[i] = + Lib_IntVector_Intrinsics_vec256_xor(uu____0, + Lib_IntVector_Intrinsics_vec256_xor(uu____1, + Lib_IntVector_Intrinsics_vec256_xor(uu____2, + Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____3 = _C[(i2 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____4 = _C[(i2 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 + _D = + Lib_IntVector_Intrinsics_vec256_xor(uu____3, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____4, + 1U), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____4, 63U))); + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + state[i2 + 5U * i] = Lib_IntVector_Intrinsics_vec256_xor(state[i2 + 5U * i], _D););); + Lib_IntVector_Intrinsics_vec256 x = state[1U]; + Lib_IntVector_Intrinsics_vec256 current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; + Lib_IntVector_Intrinsics_vec256 uu____5 = current; + state[_Y] = + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____5, + r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____5, 64U - r)); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____6 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____7 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v07 = + Lib_IntVector_Intrinsics_vec256_xor(uu____6, + Lib_IntVector_Intrinsics_vec256_and(uu____7, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____8 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____9 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v17 = + Lib_IntVector_Intrinsics_vec256_xor(uu____8, + Lib_IntVector_Intrinsics_vec256_and(uu____9, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____10 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____11 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v27 = + Lib_IntVector_Intrinsics_vec256_xor(uu____10, + Lib_IntVector_Intrinsics_vec256_and(uu____11, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____12 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____13 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v37 = + Lib_IntVector_Intrinsics_vec256_xor(uu____12, + Lib_IntVector_Intrinsics_vec256_and(uu____13, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____14 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____15 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v4 = + Lib_IntVector_Intrinsics_vec256_xor(uu____14, + Lib_IntVector_Intrinsics_vec256_and(uu____15, state[1U + 5U * i])); + 
state[0U + 5U * i] = v07; + state[1U + 5U * i] = v17; + state[2U + 5U * i] = v27; + state[3U + 5U * i] = v37; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + Lib_IntVector_Intrinsics_vec256 uu____16 = state[0U]; + state[0U] = + Lib_IntVector_Intrinsics_vec256_xor(uu____16, + Lib_IntVector_Intrinsics_vec256_load64(c)); + } + } + uint32_t rem = inputByteLen % 168U; + uint8_t b00[256U] = { 0U }; + uint8_t b10[256U] = { 0U }; + uint8_t b20[256U] = { 0U }; + uint8_t b30[256U] = { 0U }; + K____uint8_t___uint8_t____K____uint8_t___uint8_t_ + b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; + uint32_t rem1 = inputByteLen % 168U; + uint8_t *b01 = input0; + uint8_t *b11 = input1; + uint8_t *b21 = input2; + uint8_t *b31 = input3; + uint8_t *bl3 = b_.snd.snd.snd; + uint8_t *bl2 = b_.snd.snd.fst; + uint8_t *bl1 = b_.snd.fst; + uint8_t *bl0 = b_.fst; + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x1FU; + b12[rem] = 0x1FU; + b22[rem] = 0x1FU; + b32[rem] = 0x1FU; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b33 = b_.snd.snd.snd; + uint8_t *b23 = b_.snd.snd.fst; + uint8_t *b13 = b_.snd.fst; + uint8_t *b03 = b_.fst; + ws[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + 
ws[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); + Lib_IntVector_Intrinsics_vec256 v00 = ws[0U]; + Lib_IntVector_Intrinsics_vec256 v10 = ws[1U]; + Lib_IntVector_Intrinsics_vec256 v20 = ws[2U]; + Lib_IntVector_Intrinsics_vec256 v30 = ws[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v1_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v2_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v3_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v0__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v1__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v2__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 + v3__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 ws00 = v0__; + Lib_IntVector_Intrinsics_vec256 ws110 = v2__; + Lib_IntVector_Intrinsics_vec256 ws210 = v1__; + Lib_IntVector_Intrinsics_vec256 ws32 = v3__; + Lib_IntVector_Intrinsics_vec256 v01 = ws[4U]; + Lib_IntVector_Intrinsics_vec256 v11 = ws[5U]; + Lib_IntVector_Intrinsics_vec256 v21 = ws[6U]; + Lib_IntVector_Intrinsics_vec256 v31 = ws[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v1_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v2_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v3_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v0__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v1__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v2__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 + v3__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 ws40 = v0__0; + Lib_IntVector_Intrinsics_vec256 ws50 = v2__0; + Lib_IntVector_Intrinsics_vec256 ws60 = v1__0; + Lib_IntVector_Intrinsics_vec256 ws70 = v3__0; + Lib_IntVector_Intrinsics_vec256 v02 = ws[8U]; + Lib_IntVector_Intrinsics_vec256 v12 = ws[9U]; + Lib_IntVector_Intrinsics_vec256 v22 = ws[10U]; + Lib_IntVector_Intrinsics_vec256 v32 = ws[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v1_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v2_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v3_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v0__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v1__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v2__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 + v3__1 = 
Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 ws80 = v0__1; + Lib_IntVector_Intrinsics_vec256 ws90 = v2__1; + Lib_IntVector_Intrinsics_vec256 ws100 = v1__1; + Lib_IntVector_Intrinsics_vec256 ws111 = v3__1; + Lib_IntVector_Intrinsics_vec256 v03 = ws[12U]; + Lib_IntVector_Intrinsics_vec256 v13 = ws[13U]; + Lib_IntVector_Intrinsics_vec256 v23 = ws[14U]; + Lib_IntVector_Intrinsics_vec256 v33 = ws[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v1_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v2_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v3_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v0__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v1__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v2__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 + v3__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 ws120 = v0__2; + Lib_IntVector_Intrinsics_vec256 ws130 = v2__2; + Lib_IntVector_Intrinsics_vec256 ws140 = v1__2; + Lib_IntVector_Intrinsics_vec256 ws150 = v3__2; + Lib_IntVector_Intrinsics_vec256 v04 = ws[16U]; + Lib_IntVector_Intrinsics_vec256 v14 = ws[17U]; + Lib_IntVector_Intrinsics_vec256 v24 = ws[18U]; + Lib_IntVector_Intrinsics_vec256 v34 = ws[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v1_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v2_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v3_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v0__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v1__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v2__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 + v3__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 ws160 = v0__3; + Lib_IntVector_Intrinsics_vec256 ws170 = v2__3; + Lib_IntVector_Intrinsics_vec256 ws180 = v1__3; + Lib_IntVector_Intrinsics_vec256 ws190 = v3__3; + Lib_IntVector_Intrinsics_vec256 v05 = ws[20U]; + Lib_IntVector_Intrinsics_vec256 v15 = ws[21U]; + Lib_IntVector_Intrinsics_vec256 v25 = ws[22U]; + Lib_IntVector_Intrinsics_vec256 v35 = ws[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v1_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v2_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v3_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v0__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v1__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_4, v2_4); 
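+  /* Note on the interleave sequence used throughout this function: each
+     group of four ws registers initially holds four consecutive 8-byte
+     words from one of the four input buffers. The interleave_low64/
+     interleave_high64 pair followed by interleave_low128/
+     interleave_high128 performs a 4x4 transpose of 64-bit lanes, so that
+     afterwards each register holds the word at one common offset across
+     all four buffers -- the lane layout the vectorized Keccak state
+     expects when the loop below XORs ws[i] into state[i]. The swapped
+     write-back (ws1 takes v2__, ws2 takes v1__) compensates for the
+     low128/high128 steps grouping even and odd word indices. */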
+ Lib_IntVector_Intrinsics_vec256 + v2__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 + v3__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 ws200 = v0__4; + Lib_IntVector_Intrinsics_vec256 ws211 = v2__4; + Lib_IntVector_Intrinsics_vec256 ws220 = v1__4; + Lib_IntVector_Intrinsics_vec256 ws230 = v3__4; + Lib_IntVector_Intrinsics_vec256 v06 = ws[24U]; + Lib_IntVector_Intrinsics_vec256 v16 = ws[25U]; + Lib_IntVector_Intrinsics_vec256 v26 = ws[26U]; + Lib_IntVector_Intrinsics_vec256 v36 = ws[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v1_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v2_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v3_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v0__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v1__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v2__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 + v3__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 ws240 = v0__5; + Lib_IntVector_Intrinsics_vec256 ws250 = v2__5; + Lib_IntVector_Intrinsics_vec256 ws260 = v1__5; + Lib_IntVector_Intrinsics_vec256 ws270 = v3__5; + Lib_IntVector_Intrinsics_vec256 v07 = ws[28U]; + Lib_IntVector_Intrinsics_vec256 v17 = ws[29U]; + Lib_IntVector_Intrinsics_vec256 v27 = ws[30U]; + Lib_IntVector_Intrinsics_vec256 v37 = ws[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v07, v17); + Lib_IntVector_Intrinsics_vec256 + v1_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v07, v17); + Lib_IntVector_Intrinsics_vec256 + v2_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v27, v37); + Lib_IntVector_Intrinsics_vec256 + v3_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v27, v37); + Lib_IntVector_Intrinsics_vec256 + v0__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v1__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v2__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 + v3__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 ws280 = v0__6; + Lib_IntVector_Intrinsics_vec256 ws290 = v2__6; + Lib_IntVector_Intrinsics_vec256 ws300 = v1__6; + Lib_IntVector_Intrinsics_vec256 ws310 = v3__6; + ws[0U] = ws00; + ws[1U] = ws110; + ws[2U] = ws210; + ws[3U] = ws32; + ws[4U] = ws40; + ws[5U] = ws50; + ws[6U] = ws60; + ws[7U] = ws70; + ws[8U] = ws80; + ws[9U] = ws90; + ws[10U] = ws100; + ws[11U] = ws111; + ws[12U] = ws120; + ws[13U] = ws130; + ws[14U] = ws140; + ws[15U] = ws150; + ws[16U] = ws160; + ws[17U] = ws170; + ws[18U] = ws180; + ws[19U] = ws190; + ws[20U] = ws200; + ws[21U] = ws211; + ws[22U] = ws220; + ws[23U] = ws230; + ws[24U] = ws240; + ws[25U] = ws250; + ws[26U] = ws260; + ws[27U] = ws270; + ws[28U] = ws280; + ws[29U] = ws290; + ws[30U] = ws300; + ws[31U] = ws310; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = 
Lib_IntVector_Intrinsics_vec256_xor(state[i], ws[i]); + } + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; + K____uint8_t___uint8_t____K____uint8_t___uint8_t_ + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[167U] = 0x80U; + b15[167U] = 0x80U; + b25[167U] = 0x80U; + b35[167U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws33[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; + uint8_t *b2 = b.snd.snd.fst; + uint8_t *b1 = b.snd.fst; + uint8_t *b0 = b.fst; + ws33[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0); + ws33[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1); + ws33[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2); + ws33[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws33[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 32U); + ws33[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 32U); + ws33[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 32U); + ws33[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws33[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 64U); + ws33[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 64U); + ws33[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 64U); + ws33[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws33[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 96U); + ws33[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 96U); + ws33[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 96U); + ws33[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws33[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 128U); + ws33[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 128U); + ws33[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 128U); + ws33[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws33[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 160U); + ws33[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 160U); + ws33[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 160U); + ws33[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws33[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 192U); + ws33[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 192U); + ws33[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 192U); + ws33[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws33[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 224U); + ws33[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 224U); + ws33[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 224U); + ws33[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); + Lib_IntVector_Intrinsics_vec256 v08 = ws33[0U]; + Lib_IntVector_Intrinsics_vec256 v18 = ws33[1U]; + Lib_IntVector_Intrinsics_vec256 v28 = ws33[2U]; + Lib_IntVector_Intrinsics_vec256 v38 = ws33[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_7 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v08, v18); + Lib_IntVector_Intrinsics_vec256 + v1_7 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v08, v18); + Lib_IntVector_Intrinsics_vec256 + v2_7 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v28, v38); + Lib_IntVector_Intrinsics_vec256 + v3_7 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v28, v38); + Lib_IntVector_Intrinsics_vec256 + v0__7 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_7, 
v2_7); + Lib_IntVector_Intrinsics_vec256 + v1__7 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_7, v2_7); + Lib_IntVector_Intrinsics_vec256 + v2__7 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_7, v3_7); + Lib_IntVector_Intrinsics_vec256 + v3__7 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_7, v3_7); + Lib_IntVector_Intrinsics_vec256 ws0 = v0__7; + Lib_IntVector_Intrinsics_vec256 ws1 = v2__7; + Lib_IntVector_Intrinsics_vec256 ws2 = v1__7; + Lib_IntVector_Intrinsics_vec256 ws3 = v3__7; + Lib_IntVector_Intrinsics_vec256 v09 = ws33[4U]; + Lib_IntVector_Intrinsics_vec256 v19 = ws33[5U]; + Lib_IntVector_Intrinsics_vec256 v29 = ws33[6U]; + Lib_IntVector_Intrinsics_vec256 v39 = ws33[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_8 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v09, v19); + Lib_IntVector_Intrinsics_vec256 + v1_8 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v09, v19); + Lib_IntVector_Intrinsics_vec256 + v2_8 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v29, v39); + Lib_IntVector_Intrinsics_vec256 + v3_8 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v29, v39); + Lib_IntVector_Intrinsics_vec256 + v0__8 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_8, v2_8); + Lib_IntVector_Intrinsics_vec256 + v1__8 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_8, v2_8); + Lib_IntVector_Intrinsics_vec256 + v2__8 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_8, v3_8); + Lib_IntVector_Intrinsics_vec256 + v3__8 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_8, v3_8); + Lib_IntVector_Intrinsics_vec256 ws4 = v0__8; + Lib_IntVector_Intrinsics_vec256 ws5 = v2__8; + Lib_IntVector_Intrinsics_vec256 ws6 = v1__8; + Lib_IntVector_Intrinsics_vec256 ws7 = v3__8; + Lib_IntVector_Intrinsics_vec256 v010 = ws33[8U]; + Lib_IntVector_Intrinsics_vec256 v110 = ws33[9U]; + Lib_IntVector_Intrinsics_vec256 v210 = ws33[10U]; + Lib_IntVector_Intrinsics_vec256 v310 = ws33[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_9 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v010, v110); + Lib_IntVector_Intrinsics_vec256 + v1_9 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v010, v110); + Lib_IntVector_Intrinsics_vec256 + v2_9 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v210, v310); + Lib_IntVector_Intrinsics_vec256 + v3_9 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v210, v310); + Lib_IntVector_Intrinsics_vec256 + v0__9 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_9, v2_9); + Lib_IntVector_Intrinsics_vec256 + v1__9 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_9, v2_9); + Lib_IntVector_Intrinsics_vec256 + v2__9 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_9, v3_9); + Lib_IntVector_Intrinsics_vec256 + v3__9 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_9, v3_9); + Lib_IntVector_Intrinsics_vec256 ws8 = v0__9; + Lib_IntVector_Intrinsics_vec256 ws9 = v2__9; + Lib_IntVector_Intrinsics_vec256 ws10 = v1__9; + Lib_IntVector_Intrinsics_vec256 ws11 = v3__9; + Lib_IntVector_Intrinsics_vec256 v011 = ws33[12U]; + Lib_IntVector_Intrinsics_vec256 v111 = ws33[13U]; + Lib_IntVector_Intrinsics_vec256 v211 = ws33[14U]; + Lib_IntVector_Intrinsics_vec256 v311 = ws33[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_10 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v011, v111); + Lib_IntVector_Intrinsics_vec256 + v1_10 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v011, v111); + Lib_IntVector_Intrinsics_vec256 + v2_10 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v211, v311); + 
Lib_IntVector_Intrinsics_vec256 + v3_10 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v211, v311); + Lib_IntVector_Intrinsics_vec256 + v0__10 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_10, v2_10); + Lib_IntVector_Intrinsics_vec256 + v1__10 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_10, v2_10); + Lib_IntVector_Intrinsics_vec256 + v2__10 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_10, v3_10); + Lib_IntVector_Intrinsics_vec256 + v3__10 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_10, v3_10); + Lib_IntVector_Intrinsics_vec256 ws12 = v0__10; + Lib_IntVector_Intrinsics_vec256 ws13 = v2__10; + Lib_IntVector_Intrinsics_vec256 ws14 = v1__10; + Lib_IntVector_Intrinsics_vec256 ws15 = v3__10; + Lib_IntVector_Intrinsics_vec256 v012 = ws33[16U]; + Lib_IntVector_Intrinsics_vec256 v112 = ws33[17U]; + Lib_IntVector_Intrinsics_vec256 v212 = ws33[18U]; + Lib_IntVector_Intrinsics_vec256 v312 = ws33[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_11 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v012, v112); + Lib_IntVector_Intrinsics_vec256 + v1_11 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v012, v112); + Lib_IntVector_Intrinsics_vec256 + v2_11 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v212, v312); + Lib_IntVector_Intrinsics_vec256 + v3_11 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v212, v312); + Lib_IntVector_Intrinsics_vec256 + v0__11 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_11, v2_11); + Lib_IntVector_Intrinsics_vec256 + v1__11 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_11, v2_11); + Lib_IntVector_Intrinsics_vec256 + v2__11 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_11, v3_11); + Lib_IntVector_Intrinsics_vec256 + v3__11 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_11, v3_11); + Lib_IntVector_Intrinsics_vec256 ws16 = v0__11; + Lib_IntVector_Intrinsics_vec256 ws17 = v2__11; + Lib_IntVector_Intrinsics_vec256 ws18 = v1__11; + Lib_IntVector_Intrinsics_vec256 ws19 = v3__11; + Lib_IntVector_Intrinsics_vec256 v013 = ws33[20U]; + Lib_IntVector_Intrinsics_vec256 v113 = ws33[21U]; + Lib_IntVector_Intrinsics_vec256 v213 = ws33[22U]; + Lib_IntVector_Intrinsics_vec256 v313 = ws33[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_12 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v013, v113); + Lib_IntVector_Intrinsics_vec256 + v1_12 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v013, v113); + Lib_IntVector_Intrinsics_vec256 + v2_12 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v213, v313); + Lib_IntVector_Intrinsics_vec256 + v3_12 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v213, v313); + Lib_IntVector_Intrinsics_vec256 + v0__12 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_12, v2_12); + Lib_IntVector_Intrinsics_vec256 + v1__12 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_12, v2_12); + Lib_IntVector_Intrinsics_vec256 + v2__12 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_12, v3_12); + Lib_IntVector_Intrinsics_vec256 + v3__12 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_12, v3_12); + Lib_IntVector_Intrinsics_vec256 ws20 = v0__12; + Lib_IntVector_Intrinsics_vec256 ws21 = v2__12; + Lib_IntVector_Intrinsics_vec256 ws22 = v1__12; + Lib_IntVector_Intrinsics_vec256 ws23 = v3__12; + Lib_IntVector_Intrinsics_vec256 v014 = ws33[24U]; + Lib_IntVector_Intrinsics_vec256 v114 = ws33[25U]; + Lib_IntVector_Intrinsics_vec256 v214 = ws33[26U]; + Lib_IntVector_Intrinsics_vec256 v314 = ws33[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_13 = 
Lib_IntVector_Intrinsics_vec256_interleave_low64(v014, v114); + Lib_IntVector_Intrinsics_vec256 + v1_13 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v014, v114); + Lib_IntVector_Intrinsics_vec256 + v2_13 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v214, v314); + Lib_IntVector_Intrinsics_vec256 + v3_13 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v214, v314); + Lib_IntVector_Intrinsics_vec256 + v0__13 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_13, v2_13); + Lib_IntVector_Intrinsics_vec256 + v1__13 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_13, v2_13); + Lib_IntVector_Intrinsics_vec256 + v2__13 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_13, v3_13); + Lib_IntVector_Intrinsics_vec256 + v3__13 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_13, v3_13); + Lib_IntVector_Intrinsics_vec256 ws24 = v0__13; + Lib_IntVector_Intrinsics_vec256 ws25 = v2__13; + Lib_IntVector_Intrinsics_vec256 ws26 = v1__13; + Lib_IntVector_Intrinsics_vec256 ws27 = v3__13; + Lib_IntVector_Intrinsics_vec256 v0 = ws33[28U]; + Lib_IntVector_Intrinsics_vec256 v1 = ws33[29U]; + Lib_IntVector_Intrinsics_vec256 v2 = ws33[30U]; + Lib_IntVector_Intrinsics_vec256 v3 = ws33[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_14 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v1_14 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v2_14 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v3_14 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v0__14 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_14, v2_14); + Lib_IntVector_Intrinsics_vec256 + v1__14 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_14, v2_14); + Lib_IntVector_Intrinsics_vec256 + v2__14 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_14, v3_14); + Lib_IntVector_Intrinsics_vec256 + v3__14 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_14, v3_14); + Lib_IntVector_Intrinsics_vec256 ws28 = v0__14; + Lib_IntVector_Intrinsics_vec256 ws29 = v2__14; + Lib_IntVector_Intrinsics_vec256 ws30 = v1__14; + Lib_IntVector_Intrinsics_vec256 ws31 = v3__14; + ws33[0U] = ws0; + ws33[1U] = ws1; + ws33[2U] = ws2; + ws33[3U] = ws3; + ws33[4U] = ws4; + ws33[5U] = ws5; + ws33[6U] = ws6; + ws33[7U] = ws7; + ws33[8U] = ws8; + ws33[9U] = ws9; + ws33[10U] = ws10; + ws33[11U] = ws11; + ws33[12U] = ws12; + ws33[13U] = ws13; + ws33[14U] = ws14; + ws33[15U] = ws15; + ws33[16U] = ws16; + ws33[17U] = ws17; + ws33[18U] = ws18; + ws33[19U] = ws19; + ws33[20U] = ws20; + ws33[21U] = ws21; + ws33[22U] = ws22; + ws33[23U] = ws23; + ws33[24U] = ws24; + ws33[25U] = ws25; + ws33[26U] = ws26; + ws33[27U] = ws27; + ws33[28U] = ws28; + ws33[29U] = ws29; + ws33[30U] = ws30; + ws33[31U] = ws31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = Lib_IntVector_Intrinsics_vec256_xor(state[i], ws33[i]); + } + for (uint32_t i0 = 0U; i0 < 24U; i0++) + { + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____17 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____18 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____19 = state[i + 10U]; + _C[i] = + Lib_IntVector_Intrinsics_vec256_xor(uu____17, + Lib_IntVector_Intrinsics_vec256_xor(uu____18, + Lib_IntVector_Intrinsics_vec256_xor(uu____19, + 
Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); + KRML_MAYBE_FOR5(i1, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____20 = _C[(i1 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____21 = _C[(i1 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 + _D = + Lib_IntVector_Intrinsics_vec256_xor(uu____20, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____21, + 1U), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____21, 63U))); + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + state[i1 + 5U * i] = Lib_IntVector_Intrinsics_vec256_xor(state[i1 + 5U * i], _D););); + Lib_IntVector_Intrinsics_vec256 x = state[1U]; + Lib_IntVector_Intrinsics_vec256 current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; + Lib_IntVector_Intrinsics_vec256 uu____22 = current; + state[_Y] = + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____22, r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____22, 64U - r)); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____23 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____24 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v015 = + Lib_IntVector_Intrinsics_vec256_xor(uu____23, + Lib_IntVector_Intrinsics_vec256_and(uu____24, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____25 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____26 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v115 = + Lib_IntVector_Intrinsics_vec256_xor(uu____25, + Lib_IntVector_Intrinsics_vec256_and(uu____26, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____27 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____28 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v215 = + Lib_IntVector_Intrinsics_vec256_xor(uu____27, + Lib_IntVector_Intrinsics_vec256_and(uu____28, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____29 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____30 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v315 = + Lib_IntVector_Intrinsics_vec256_xor(uu____29, + Lib_IntVector_Intrinsics_vec256_and(uu____30, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____31 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____32 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v4 = + Lib_IntVector_Intrinsics_vec256_xor(uu____31, + Lib_IntVector_Intrinsics_vec256_and(uu____32, state[1U + 5U * i])); + state[0U + 5U * i] = v015; + state[1U + 5U * i] = v115; + state[2U + 5U * i] = v215; + state[3U + 5U * i] = v315; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; + Lib_IntVector_Intrinsics_vec256 uu____33 = state[0U]; + state[0U] = + Lib_IntVector_Intrinsics_vec256_xor(uu____33, + Lib_IntVector_Intrinsics_vec256_load64(c)); + } +} + +void +Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < outputByteLen / 168U; i0++) + { + uint8_t hbuf[1024U] = { 0U }; + 
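  /* Descriptive note on the block that follows: each squeeze round copies the
     interleaved 25-lane state into ws, undoes the 4-way lane interleaving with
     the 64-/128-bit interleave shuffles below, stores the de-interleaved lanes
     to hbuf, and then emits one 168-byte rate block to each of the four output
     streams before running the 24-round permutation. */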
KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws[32U] KRML_POST_ALIGN(32) = { 0U }; + memcpy(ws, state, 25U * sizeof (Lib_IntVector_Intrinsics_vec256)); + Lib_IntVector_Intrinsics_vec256 v00 = ws[0U]; + Lib_IntVector_Intrinsics_vec256 v10 = ws[1U]; + Lib_IntVector_Intrinsics_vec256 v20 = ws[2U]; + Lib_IntVector_Intrinsics_vec256 v30 = ws[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v1_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v2_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v3_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v0__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v1__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v2__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 + v3__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 ws0 = v0__; + Lib_IntVector_Intrinsics_vec256 ws1 = v2__; + Lib_IntVector_Intrinsics_vec256 ws2 = v1__; + Lib_IntVector_Intrinsics_vec256 ws3 = v3__; + Lib_IntVector_Intrinsics_vec256 v01 = ws[4U]; + Lib_IntVector_Intrinsics_vec256 v11 = ws[5U]; + Lib_IntVector_Intrinsics_vec256 v21 = ws[6U]; + Lib_IntVector_Intrinsics_vec256 v31 = ws[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v1_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v2_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v3_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v0__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v1__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v2__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 + v3__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 ws4 = v0__0; + Lib_IntVector_Intrinsics_vec256 ws5 = v2__0; + Lib_IntVector_Intrinsics_vec256 ws6 = v1__0; + Lib_IntVector_Intrinsics_vec256 ws7 = v3__0; + Lib_IntVector_Intrinsics_vec256 v02 = ws[8U]; + Lib_IntVector_Intrinsics_vec256 v12 = ws[9U]; + Lib_IntVector_Intrinsics_vec256 v22 = ws[10U]; + Lib_IntVector_Intrinsics_vec256 v32 = ws[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v1_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v2_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v3_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v0__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v1__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v2__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 + 
v3__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 ws8 = v0__1; + Lib_IntVector_Intrinsics_vec256 ws9 = v2__1; + Lib_IntVector_Intrinsics_vec256 ws10 = v1__1; + Lib_IntVector_Intrinsics_vec256 ws11 = v3__1; + Lib_IntVector_Intrinsics_vec256 v03 = ws[12U]; + Lib_IntVector_Intrinsics_vec256 v13 = ws[13U]; + Lib_IntVector_Intrinsics_vec256 v23 = ws[14U]; + Lib_IntVector_Intrinsics_vec256 v33 = ws[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v1_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v2_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v3_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v0__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v1__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v2__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 + v3__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 ws12 = v0__2; + Lib_IntVector_Intrinsics_vec256 ws13 = v2__2; + Lib_IntVector_Intrinsics_vec256 ws14 = v1__2; + Lib_IntVector_Intrinsics_vec256 ws15 = v3__2; + Lib_IntVector_Intrinsics_vec256 v04 = ws[16U]; + Lib_IntVector_Intrinsics_vec256 v14 = ws[17U]; + Lib_IntVector_Intrinsics_vec256 v24 = ws[18U]; + Lib_IntVector_Intrinsics_vec256 v34 = ws[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v1_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v2_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v3_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v0__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v1__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v2__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 + v3__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 ws16 = v0__3; + Lib_IntVector_Intrinsics_vec256 ws17 = v2__3; + Lib_IntVector_Intrinsics_vec256 ws18 = v1__3; + Lib_IntVector_Intrinsics_vec256 ws19 = v3__3; + Lib_IntVector_Intrinsics_vec256 v05 = ws[20U]; + Lib_IntVector_Intrinsics_vec256 v15 = ws[21U]; + Lib_IntVector_Intrinsics_vec256 v25 = ws[22U]; + Lib_IntVector_Intrinsics_vec256 v35 = ws[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v1_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v2_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v3_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v0__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v1__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_4, v2_4); + 
Lib_IntVector_Intrinsics_vec256 + v2__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 + v3__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 ws20 = v0__4; + Lib_IntVector_Intrinsics_vec256 ws21 = v2__4; + Lib_IntVector_Intrinsics_vec256 ws22 = v1__4; + Lib_IntVector_Intrinsics_vec256 ws23 = v3__4; + Lib_IntVector_Intrinsics_vec256 v06 = ws[24U]; + Lib_IntVector_Intrinsics_vec256 v16 = ws[25U]; + Lib_IntVector_Intrinsics_vec256 v26 = ws[26U]; + Lib_IntVector_Intrinsics_vec256 v36 = ws[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v1_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v2_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v3_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v0__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v1__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v2__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 + v3__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 ws24 = v0__5; + Lib_IntVector_Intrinsics_vec256 ws25 = v2__5; + Lib_IntVector_Intrinsics_vec256 ws26 = v1__5; + Lib_IntVector_Intrinsics_vec256 ws27 = v3__5; + Lib_IntVector_Intrinsics_vec256 v0 = ws[28U]; + Lib_IntVector_Intrinsics_vec256 v1 = ws[29U]; + Lib_IntVector_Intrinsics_vec256 v2 = ws[30U]; + Lib_IntVector_Intrinsics_vec256 v3 = ws[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v1_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v2_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v3_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v0__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v1__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v2__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 + v3__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 ws28 = v0__6; + Lib_IntVector_Intrinsics_vec256 ws29 = v2__6; + Lib_IntVector_Intrinsics_vec256 ws30 = v1__6; + Lib_IntVector_Intrinsics_vec256 ws31 = v3__6; + ws[0U] = ws0; + ws[1U] = ws1; + ws[2U] = ws2; + ws[3U] = ws3; + ws[4U] = ws4; + ws[5U] = ws5; + ws[6U] = ws6; + ws[7U] = ws7; + ws[8U] = ws8; + ws[9U] = ws9; + ws[10U] = ws10; + ws[11U] = ws11; + ws[12U] = ws12; + ws[13U] = ws13; + ws[14U] = ws14; + ws[15U] = ws15; + ws[16U] = ws16; + ws[17U] = ws17; + ws[18U] = ws18; + ws[19U] = ws19; + ws[20U] = ws20; + ws[21U] = ws21; + ws[22U] = ws22; + ws[23U] = ws23; + ws[24U] = ws24; + ws[25U] = ws25; + ws[26U] = ws26; + ws[27U] = ws27; + ws[28U] = ws28; + ws[29U] = ws29; + ws[30U] = ws30; + ws[31U] = ws31; + for (uint32_t i = 0U; i < 32U; i++) + { + Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); + } + KRML_MAYBE_FOR5(i, + 0U, 
+ 5U, + 1U, + uint8_t *b0 = output0; + uint8_t *b1 = output1; + uint8_t *b2 = output2; + uint8_t *b3 = output3; + memcpy(b0 + i0 * 168U + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); + memcpy(b1 + i0 * 168U + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); + memcpy(b2 + i0 * 168U + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); + memcpy(b3 + i0 * 168U + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t));); + uint8_t *b0 = output0; + uint8_t *b1 = output1; + uint8_t *b2 = output2; + uint8_t *b3 = output3; + memcpy(b0 + i0 * 168U + 160U, hbuf + 640U, 8U * sizeof (uint8_t)); + memcpy(b1 + i0 * 168U + 160U, hbuf + 672U, 8U * sizeof (uint8_t)); + memcpy(b2 + i0 * 168U + 160U, hbuf + 704U, 8U * sizeof (uint8_t)); + memcpy(b3 + i0 * 168U + 160U, hbuf + 736U, 8U * sizeof (uint8_t)); + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____0 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____1 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____2 = state[i + 10U]; + _C[i] = + Lib_IntVector_Intrinsics_vec256_xor(uu____0, + Lib_IntVector_Intrinsics_vec256_xor(uu____1, + Lib_IntVector_Intrinsics_vec256_xor(uu____2, + Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____3 = _C[(i2 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____4 = _C[(i2 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 + _D = + Lib_IntVector_Intrinsics_vec256_xor(uu____3, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____4, + 1U), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____4, 63U))); + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + state[i2 + 5U * i] = Lib_IntVector_Intrinsics_vec256_xor(state[i2 + 5U * i], _D););); + Lib_IntVector_Intrinsics_vec256 x = state[1U]; + Lib_IntVector_Intrinsics_vec256 current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; + Lib_IntVector_Intrinsics_vec256 uu____5 = current; + state[_Y] = + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____5, + r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____5, 64U - r)); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____6 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____7 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v07 = + Lib_IntVector_Intrinsics_vec256_xor(uu____6, + Lib_IntVector_Intrinsics_vec256_and(uu____7, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____8 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____9 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v17 = + Lib_IntVector_Intrinsics_vec256_xor(uu____8, + Lib_IntVector_Intrinsics_vec256_and(uu____9, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____10 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____11 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v27 = + Lib_IntVector_Intrinsics_vec256_xor(uu____10, + Lib_IntVector_Intrinsics_vec256_and(uu____11, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 
uu____12 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____13 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v37 = + Lib_IntVector_Intrinsics_vec256_xor(uu____12, + Lib_IntVector_Intrinsics_vec256_and(uu____13, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____14 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____15 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v4 = + Lib_IntVector_Intrinsics_vec256_xor(uu____14, + Lib_IntVector_Intrinsics_vec256_and(uu____15, state[1U + 5U * i])); + state[0U + 5U * i] = v07; + state[1U + 5U * i] = v17; + state[2U + 5U * i] = v27; + state[3U + 5U * i] = v37; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + Lib_IntVector_Intrinsics_vec256 uu____16 = state[0U]; + state[0U] = + Lib_IntVector_Intrinsics_vec256_xor(uu____16, + Lib_IntVector_Intrinsics_vec256_load64(c)); + } + } +} + From f1d02ee1245a46b2b52150719c6a9feb2e0a3642 Mon Sep 17 00:00:00 2001 From: mamonet Date: Mon, 15 Jan 2024 20:21:19 +0200 Subject: [PATCH 3/6] Add SHA3 absorb nblocks/last API --- include/Hacl_Hash_SHA3_Scalar.h | 14 + include/Hacl_Hash_SHA3_Simd256.h | 14 +- include/msvc/Hacl_Hash_SHA3_Scalar.h | 14 + include/msvc/Hacl_Hash_SHA3_Simd256.h | 14 +- src/Hacl_Hash_SHA3_Scalar.c | 338 +++ src/Hacl_Hash_SHA3_Simd256.c | 1457 ++++------ src/msvc/Hacl_Hash_SHA3_Scalar.c | 760 ++++- src/msvc/Hacl_Hash_SHA3_Simd256.c | 3685 +++++++++++++++++-------- 8 files changed, 4216 insertions(+), 2080 deletions(-) diff --git a/include/Hacl_Hash_SHA3_Scalar.h b/include/Hacl_Hash_SHA3_Scalar.h index 4b893cd8..d0b7f253 100644 --- a/include/Hacl_Hash_SHA3_Scalar.h +++ b/include/Hacl_Hash_SHA3_Scalar.h @@ -63,6 +63,20 @@ uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void); void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s); +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +); + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_last( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +); + void Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen); diff --git a/include/Hacl_Hash_SHA3_Simd256.h b/include/Hacl_Hash_SHA3_Simd256.h index 534a8899..d231e273 100644 --- a/include/Hacl_Hash_SHA3_Simd256.h +++ b/include/Hacl_Hash_SHA3_Simd256.h @@ -35,6 +35,8 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" +#include "libintvector.h" + typedef struct K____uint8_t___uint8_t__s { uint8_t *fst; @@ -141,7 +143,17 @@ uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void); void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s); void -Hacl_Hash_SHA3_Simd256_shake128_absorb( +Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +); + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb_last( Lib_IntVector_Intrinsics_vec256 *state, uint8_t *input0, uint8_t *input1, diff --git a/include/msvc/Hacl_Hash_SHA3_Scalar.h b/include/msvc/Hacl_Hash_SHA3_Scalar.h index 4b893cd8..d0b7f253 100644 --- a/include/msvc/Hacl_Hash_SHA3_Scalar.h +++ b/include/msvc/Hacl_Hash_SHA3_Scalar.h @@ -63,6 +63,20 @@ uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void); void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s); +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( + uint64_t *state, + uint8_t *input, + 
uint32_t inputByteLen +); + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_last( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +); + void Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen); diff --git a/include/msvc/Hacl_Hash_SHA3_Simd256.h b/include/msvc/Hacl_Hash_SHA3_Simd256.h index 534a8899..d231e273 100644 --- a/include/msvc/Hacl_Hash_SHA3_Simd256.h +++ b/include/msvc/Hacl_Hash_SHA3_Simd256.h @@ -35,6 +35,8 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" +#include "libintvector.h" + typedef struct K____uint8_t___uint8_t__s { uint8_t *fst; @@ -141,7 +143,17 @@ uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void); void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s); void -Hacl_Hash_SHA3_Simd256_shake128_absorb( +Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +); + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb_last( Lib_IntVector_Intrinsics_vec256 *state, uint8_t *input0, uint8_t *input1, diff --git a/src/Hacl_Hash_SHA3_Scalar.c b/src/Hacl_Hash_SHA3_Scalar.c index 19a1936a..7393ebf2 100644 --- a/src/Hacl_Hash_SHA3_Scalar.c +++ b/src/Hacl_Hash_SHA3_Scalar.c @@ -2392,6 +2392,344 @@ void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s) KRML_HOST_FREE(s); } +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) + { + uint8_t b1[256U] = { 0U }; + uint8_t *b_ = b1; + uint8_t *b0 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b0 + i0 * 168U, 168U * sizeof (uint8_t)); + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u = load64_le(b); + ws[0U] = u; + uint64_t u0 = load64_le(b + 8U); + ws[1U] = u0; + uint64_t u1 = load64_le(b + 16U); + ws[2U] = u1; + uint64_t u2 = load64_le(b + 24U); + ws[3U] = u2; + uint64_t u3 = load64_le(b + 32U); + ws[4U] = u3; + uint64_t u4 = load64_le(b + 40U); + ws[5U] = u4; + uint64_t u5 = load64_le(b + 48U); + ws[6U] = u5; + uint64_t u6 = load64_le(b + 56U); + ws[7U] = u6; + uint64_t u7 = load64_le(b + 64U); + ws[8U] = u7; + uint64_t u8 = load64_le(b + 72U); + ws[9U] = u8; + uint64_t u9 = load64_le(b + 80U); + ws[10U] = u9; + uint64_t u10 = load64_le(b + 88U); + ws[11U] = u10; + uint64_t u11 = load64_le(b + 96U); + ws[12U] = u11; + uint64_t u12 = load64_le(b + 104U); + ws[13U] = u12; + uint64_t u13 = load64_le(b + 112U); + ws[14U] = u13; + uint64_t u14 = load64_le(b + 120U); + ws[15U] = u14; + uint64_t u15 = load64_le(b + 128U); + ws[16U] = u15; + uint64_t u16 = load64_le(b + 136U); + ws[17U] = u16; + uint64_t u17 = load64_le(b + 144U); + ws[18U] = u17; + uint64_t u18 = load64_le(b + 152U); + ws[19U] = u18; + uint64_t u19 = load64_le(b + 160U); + ws[20U] = u19; + uint64_t u20 = load64_le(b + 168U); + ws[21U] = u20; + uint64_t u21 = load64_le(b + 176U); + ws[22U] = u21; + uint64_t u22 = load64_le(b + 184U); + ws[23U] = u22; + uint64_t u23 = load64_le(b + 192U); + ws[24U] = u23; + uint64_t u24 = load64_le(b + 200U); + ws[25U] = u24; + uint64_t u25 = load64_le(b + 208U); + ws[26U] = u25; + uint64_t u26 = load64_le(b + 216U); + ws[27U] = u26; + uint64_t u27 = load64_le(b + 224U); + ws[28U] = u27; + uint64_t u28 = load64_le(b + 232U); + ws[29U] = u28; + uint64_t u29 = load64_le(b + 240U); + ws[30U] = u29; + uint64_t u30 = load64_le(b + 248U); + ws[31U] = u30; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = 
state[i] ^ ws[i]; + } + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = + state[i + + 0U] + ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i2 + 1U) % 5U]; + uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + state[0U] = state[0U] ^ c; + } + } +} + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_last( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +) +{ + uint32_t rem = inputByteLen % 168U; + uint8_t b2[256U] = { 0U }; + uint8_t *b_ = b2; + uint32_t rem1 = inputByteLen % 168U; + uint8_t *b00 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b00 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b01 = b_; + b01[rem] = 0x1FU; + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u0 = load64_le(b); + ws[0U] = u0; + uint64_t u1 = load64_le(b + 8U); + ws[1U] = u1; + uint64_t u2 = load64_le(b + 16U); + ws[2U] = u2; + uint64_t u3 = load64_le(b + 24U); + ws[3U] = u3; + uint64_t u4 = load64_le(b + 32U); + ws[4U] = u4; + uint64_t u5 = load64_le(b + 40U); + ws[5U] = u5; + uint64_t u6 = load64_le(b + 48U); + ws[6U] = u6; + uint64_t u7 = load64_le(b + 56U); + ws[7U] = u7; + uint64_t u8 = load64_le(b + 64U); + ws[8U] = u8; + uint64_t u9 = load64_le(b + 72U); + ws[9U] = u9; + uint64_t u10 = load64_le(b + 80U); + ws[10U] = u10; + uint64_t u11 = load64_le(b + 88U); + ws[11U] = u11; + uint64_t u12 = load64_le(b + 96U); + ws[12U] = u12; + uint64_t u13 = load64_le(b + 104U); + ws[13U] = u13; + uint64_t u14 = load64_le(b + 112U); + ws[14U] = u14; + uint64_t u15 = load64_le(b + 120U); + ws[15U] = u15; + uint64_t u16 = load64_le(b + 128U); + ws[16U] = u16; + uint64_t u17 = load64_le(b + 136U); + ws[17U] = u17; + uint64_t u18 = load64_le(b + 144U); + ws[18U] = u18; + uint64_t u19 = load64_le(b + 152U); + ws[19U] = u19; + uint64_t u20 = load64_le(b + 160U); + ws[20U] = u20; + uint64_t u21 = load64_le(b + 168U); + ws[21U] = u21; + uint64_t u22 = load64_le(b + 176U); + ws[22U] = u22; + uint64_t u23 = load64_le(b + 184U); + ws[23U] = u23; + uint64_t u24 = load64_le(b + 192U); + ws[24U] = u24; + uint64_t u25 = load64_le(b + 200U); + ws[25U] = u25; + uint64_t u26 = load64_le(b + 208U); + ws[26U] = u26; + uint64_t u27 = load64_le(b + 216U); + ws[27U] = u27; + uint64_t u28 = load64_le(b + 224U); + ws[28U] = u28; + uint64_t u29 = load64_le(b + 232U); + ws[29U] = u29; + uint64_t u30 = load64_le(b + 
240U); + ws[30U] = u30; + uint64_t u31 = load64_le(b + 248U); + ws[31U] = u31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + uint8_t b3[256U] = { 0U }; + uint8_t *b4 = b3; + uint8_t *b0 = b4; + b0[167U] = 0x80U; + uint64_t ws0[32U] = { 0U }; + uint8_t *b1 = b4; + uint64_t u = load64_le(b1); + ws0[0U] = u; + uint64_t u32 = load64_le(b1 + 8U); + ws0[1U] = u32; + uint64_t u33 = load64_le(b1 + 16U); + ws0[2U] = u33; + uint64_t u34 = load64_le(b1 + 24U); + ws0[3U] = u34; + uint64_t u35 = load64_le(b1 + 32U); + ws0[4U] = u35; + uint64_t u36 = load64_le(b1 + 40U); + ws0[5U] = u36; + uint64_t u37 = load64_le(b1 + 48U); + ws0[6U] = u37; + uint64_t u38 = load64_le(b1 + 56U); + ws0[7U] = u38; + uint64_t u39 = load64_le(b1 + 64U); + ws0[8U] = u39; + uint64_t u40 = load64_le(b1 + 72U); + ws0[9U] = u40; + uint64_t u41 = load64_le(b1 + 80U); + ws0[10U] = u41; + uint64_t u42 = load64_le(b1 + 88U); + ws0[11U] = u42; + uint64_t u43 = load64_le(b1 + 96U); + ws0[12U] = u43; + uint64_t u44 = load64_le(b1 + 104U); + ws0[13U] = u44; + uint64_t u45 = load64_le(b1 + 112U); + ws0[14U] = u45; + uint64_t u46 = load64_le(b1 + 120U); + ws0[15U] = u46; + uint64_t u47 = load64_le(b1 + 128U); + ws0[16U] = u47; + uint64_t u48 = load64_le(b1 + 136U); + ws0[17U] = u48; + uint64_t u49 = load64_le(b1 + 144U); + ws0[18U] = u49; + uint64_t u50 = load64_le(b1 + 152U); + ws0[19U] = u50; + uint64_t u51 = load64_le(b1 + 160U); + ws0[20U] = u51; + uint64_t u52 = load64_le(b1 + 168U); + ws0[21U] = u52; + uint64_t u53 = load64_le(b1 + 176U); + ws0[22U] = u53; + uint64_t u54 = load64_le(b1 + 184U); + ws0[23U] = u54; + uint64_t u55 = load64_le(b1 + 192U); + ws0[24U] = u55; + uint64_t u56 = load64_le(b1 + 200U); + ws0[25U] = u56; + uint64_t u57 = load64_le(b1 + 208U); + ws0[26U] = u57; + uint64_t u58 = load64_le(b1 + 216U); + ws0[27U] = u58; + uint64_t u59 = load64_le(b1 + 224U); + ws0[28U] = u59; + uint64_t u60 = load64_le(b1 + 232U); + ws0[29U] = u60; + uint64_t u61 = load64_le(b1 + 240U); + ws0[30U] = u61; + uint64_t u62 = load64_le(b1 + 248U); + ws0[31U] = u62; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws0[i]; + } + for (uint32_t i0 = 0U; i0 < 24U; i0++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = state[i + 0U] ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i1, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i1 + 1U) % 5U]; + uint64_t _D = _C[(i1 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i1 + 5U * i] = state[i1 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = 
Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; + state[0U] = state[0U] ^ c; + } +} + void Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen) { diff --git a/src/Hacl_Hash_SHA3_Simd256.c b/src/Hacl_Hash_SHA3_Simd256.c index a401bd71..9748a375 100644 --- a/src/Hacl_Hash_SHA3_Simd256.c +++ b/src/Hacl_Hash_SHA3_Simd256.c @@ -26,7 +26,6 @@ #include "Hacl_Hash_SHA3_Simd256.h" #include "internal/Hacl_Hash_SHA3_Scalar.h" -#include "libintvector.h" void Hacl_Hash_SHA3_Simd256_shake128( @@ -738,42 +737,42 @@ Hacl_Hash_SHA3_Simd256_shake128( b25[rateInBytes - 1U] = 0x80U; b35[rateInBytes - 1U] = 0x80U; KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b36 = b.snd.snd.snd; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; 
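/* Illustrative sketch (not part of the generated code): one way the absorb
   nblocks/last API introduced by this patch can be driven for a streaming
   4-way SHAKE128, using only functions whose signatures appear in this series.
   All other identifiers (shake128_x4_stream, in0..in3, out0..out3, inLen,
   outLen) are placeholders; outLen is assumed to be a multiple of the 168-byte
   rate, since squeeze_nblocks emits whole rate blocks only. */
#include "Hacl_Hash_SHA3_Simd256.h"

static void
shake128_x4_stream(uint8_t *in0, uint8_t *in1, uint8_t *in2, uint8_t *in3,
                   uint32_t inLen,
                   uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3,
                   uint32_t outLen)
{
  /* A zeroed 25-lane interleaved state is the Keccak initial state. */
  KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256
  st[25U] KRML_POST_ALIGN(32) = { 0U };
  /* Absorbs the inLen / 168 full blocks of each input stream. */
  Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks(st, in0, in1, in2, in3, inLen);
  /* Absorbs the inLen % 168 trailing bytes and applies the 0x1F / 0x80
     padding; both calls take the whole input pointer and total length,
     mirroring the Scalar implementations above. */
  Hacl_Hash_SHA3_Simd256_shake128_absorb_last(st, in0, in1, in2, in3, inLen);
  /* Squeezes outLen / 168 rate blocks into each output stream. */
  Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks(st, out0, out1, out2, out3, outLen);
}
/* The Scalar variants (Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks/_last)
   follow the same pattern with a single input/output and a uint64_t state,
   e.g. one obtained from Hacl_Hash_SHA3_Scalar_state_malloc(). */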
Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -1295,71 +1294,49 @@ Hacl_Hash_SHA3_Simd256_shake128( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (rateInBytes % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 32U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 64U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 96U, - rateInBytes % 32U * sizeof (uint8_t)); - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -1654,72 +1631,49 @@ Hacl_Hash_SHA3_Simd256_shake128( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] 
= ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (remOut % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 32U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 64U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 96U, - remOut % 32U * sizeof (uint8_t)); - return; - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + outputByteLen - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + outputByteLen - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + outputByteLen - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + outputByteLen - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void @@ -2432,42 +2386,42 @@ Hacl_Hash_SHA3_Simd256_shake256( b25[rateInBytes - 1U] = 0x80U; b35[rateInBytes - 1U] = 0x80U; KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b36 = b.snd.snd.snd; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); - ws34[7U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -2989,71 +2943,49 @@ Hacl_Hash_SHA3_Simd256_shake256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { 
Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (rateInBytes % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 32U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 64U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 96U, - rateInBytes % 32U * sizeof (uint8_t)); - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -3348,72 +3280,49 @@ Hacl_Hash_SHA3_Simd256_shake256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - 
memcpy(b2 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (remOut % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 32U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 64U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 96U, - remOut % 32U * sizeof (uint8_t)); - return; - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + outputByteLen - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + outputByteLen - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + outputByteLen - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + outputByteLen - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void @@ -4125,42 +4034,42 @@ Hacl_Hash_SHA3_Simd256_sha3_224( b25[rateInBytes - 1U] = 0x80U; b35[rateInBytes - 1U] = 0x80U; KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b36 = b.snd.snd.snd; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); 
+ ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -4682,71 +4591,49 @@ Hacl_Hash_SHA3_Simd256_sha3_224( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (rateInBytes % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 32U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 64U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 96U, - rateInBytes % 32U * sizeof (uint8_t)); - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + 
uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -5041,72 +4928,49 @@ Hacl_Hash_SHA3_Simd256_sha3_224( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 28U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 28U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 28U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 28U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (remOut % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 28U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b1 + 28U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 32U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b2 + 28U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 64U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b3 + 28U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 96U, - remOut % 32U * sizeof (uint8_t)); - return; - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 28U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 28U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 28U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 28U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void @@ -5818,42 +5682,42 @@ Hacl_Hash_SHA3_Simd256_sha3_256( b25[rateInBytes - 1U] = 0x80U; b35[rateInBytes - 1U] = 0x80U; KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b36 
= b.snd.snd.snd; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -6375,71 +6239,49 @@ Hacl_Hash_SHA3_Simd256_sha3_256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] 
= ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (rateInBytes % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 32U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 64U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 96U, - rateInBytes % 32U * sizeof (uint8_t)); - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -6734,72 +6576,49 @@ Hacl_Hash_SHA3_Simd256_sha3_256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + 
ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 32U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 32U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 32U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 32U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (remOut % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 32U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b1 + 32U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 32U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b2 + 32U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 64U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b3 + 32U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 96U, - remOut % 32U * sizeof (uint8_t)); - return; - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 32U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 32U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 32U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 32U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void @@ -7511,42 +7330,42 @@ Hacl_Hash_SHA3_Simd256_sha3_384( b25[rateInBytes - 1U] = 0x80U; b35[rateInBytes - 1U] = 0x80U; KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b36 = b.snd.snd.snd; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); ws34[17U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -8068,71 +7887,49 @@ Hacl_Hash_SHA3_Simd256_sha3_384( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (rateInBytes % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U, - rateInBytes % 32U * sizeof 
(uint8_t)); - memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 32U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 64U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 96U, - rateInBytes % 32U * sizeof (uint8_t)); - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -8427,72 +8224,49 @@ Hacl_Hash_SHA3_Simd256_sha3_384( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 48U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 48U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 48U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 48U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (remOut % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 48U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b1 + 48U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 32U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b2 + 48U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 64U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b3 + 48U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 96U, - remOut % 32U * sizeof (uint8_t)); - return; - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = 
rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 48U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 48U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 48U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 48U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void @@ -9204,42 +8978,42 @@ Hacl_Hash_SHA3_Simd256_sha3_512( b25[rateInBytes - 1U] = 0x80U; b35[rateInBytes - 1U] = 0x80U; KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b36 = b.snd.snd.snd; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b36 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -9761,71 +9535,49 @@ Hacl_Hash_SHA3_Simd256_sha3_512( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; 
Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (rateInBytes % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 32U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 64U, - rateInBytes % 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * rateInBytes + rateInBytes / 32U * 32U, - hbuf + rateInBytes / 32U * 128U + 96U, - rateInBytes % 32U * sizeof (uint8_t)); - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -10120,72 +9872,49 @@ Hacl_Hash_SHA3_Simd256_sha3_512( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] 
= ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 64U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 64U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 64U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 64U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - if (remOut % 32U > 0U) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 64U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b1 + 64U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 32U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b2 + 64U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 64U, - remOut % 32U * sizeof (uint8_t)); - memcpy(b3 + 64U - remOut + remOut / 32U * 32U, - hbuf + remOut / 32U * 128U + 96U, - remOut % 32U * sizeof (uint8_t)); - return; - } + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + 64U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 64U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 64U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 64U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void) @@ -10200,7 +9929,7 @@ void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s) } void -Hacl_Hash_SHA3_Simd256_shake128_absorb( +Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( Lib_IntVector_Intrinsics_vec256 *state, uint8_t *input0, uint8_t *input1, @@ -10591,6 +10320,18 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb( Lib_IntVector_Intrinsics_vec256_load64(c)); } } +} + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb_last( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +) +{ uint32_t rem = inputByteLen % 168U; uint8_t b00[256U] = { 0U }; uint8_t b10[256U] = { 0U }; @@ -11170,26 +10911,26 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb( 0U, 5U, 1U, - Lib_IntVector_Intrinsics_vec256 uu____17 = state[i + 0U]; - Lib_IntVector_Intrinsics_vec256 uu____18 = state[i + 5U]; - Lib_IntVector_Intrinsics_vec256 uu____19 = state[i + 10U]; + Lib_IntVector_Intrinsics_vec256 uu____0 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____1 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____2 = state[i + 10U]; _C[i] = - Lib_IntVector_Intrinsics_vec256_xor(uu____17, - 
Lib_IntVector_Intrinsics_vec256_xor(uu____18, - Lib_IntVector_Intrinsics_vec256_xor(uu____19, + Lib_IntVector_Intrinsics_vec256_xor(uu____0, + Lib_IntVector_Intrinsics_vec256_xor(uu____1, + Lib_IntVector_Intrinsics_vec256_xor(uu____2, Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); KRML_MAYBE_FOR5(i1, 0U, 5U, 1U, - Lib_IntVector_Intrinsics_vec256 uu____20 = _C[(i1 + 4U) % 5U]; - Lib_IntVector_Intrinsics_vec256 uu____21 = _C[(i1 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____3 = _C[(i1 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____4 = _C[(i1 + 1U) % 5U]; Lib_IntVector_Intrinsics_vec256 _D = - Lib_IntVector_Intrinsics_vec256_xor(uu____20, - Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____21, + Lib_IntVector_Intrinsics_vec256_xor(uu____3, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____4, 1U), - Lib_IntVector_Intrinsics_vec256_shift_right64(uu____21, 63U))); + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____4, 63U))); KRML_MAYBE_FOR5(i, 0U, 5U, @@ -11202,60 +10943,60 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb( uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; - Lib_IntVector_Intrinsics_vec256 uu____22 = current; + Lib_IntVector_Intrinsics_vec256 uu____5 = current; state[_Y] = - Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____22, r), - Lib_IntVector_Intrinsics_vec256_shift_right64(uu____22, 64U - r)); + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____5, r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____5, 64U - r)); current = temp; } KRML_MAYBE_FOR5(i, 0U, 5U, 1U, - Lib_IntVector_Intrinsics_vec256 uu____23 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 uu____6 = state[0U + 5U * i]; Lib_IntVector_Intrinsics_vec256 - uu____24 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + uu____7 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); Lib_IntVector_Intrinsics_vec256 v015 = - Lib_IntVector_Intrinsics_vec256_xor(uu____23, - Lib_IntVector_Intrinsics_vec256_and(uu____24, state[2U + 5U * i])); - Lib_IntVector_Intrinsics_vec256 uu____25 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256_xor(uu____6, + Lib_IntVector_Intrinsics_vec256_and(uu____7, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____8 = state[1U + 5U * i]; Lib_IntVector_Intrinsics_vec256 - uu____26 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + uu____9 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); Lib_IntVector_Intrinsics_vec256 v115 = - Lib_IntVector_Intrinsics_vec256_xor(uu____25, - Lib_IntVector_Intrinsics_vec256_and(uu____26, state[3U + 5U * i])); - Lib_IntVector_Intrinsics_vec256 uu____27 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256_xor(uu____8, + Lib_IntVector_Intrinsics_vec256_and(uu____9, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____10 = state[2U + 5U * i]; Lib_IntVector_Intrinsics_vec256 - uu____28 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + uu____11 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); Lib_IntVector_Intrinsics_vec256 v215 = - Lib_IntVector_Intrinsics_vec256_xor(uu____27, - Lib_IntVector_Intrinsics_vec256_and(uu____28, state[4U + 5U * i])); - Lib_IntVector_Intrinsics_vec256 uu____29 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256_xor(uu____10, + 
Lib_IntVector_Intrinsics_vec256_and(uu____11, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____12 = state[3U + 5U * i]; Lib_IntVector_Intrinsics_vec256 - uu____30 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + uu____13 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); Lib_IntVector_Intrinsics_vec256 v315 = - Lib_IntVector_Intrinsics_vec256_xor(uu____29, - Lib_IntVector_Intrinsics_vec256_and(uu____30, state[0U + 5U * i])); - Lib_IntVector_Intrinsics_vec256 uu____31 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256_xor(uu____12, + Lib_IntVector_Intrinsics_vec256_and(uu____13, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____14 = state[4U + 5U * i]; Lib_IntVector_Intrinsics_vec256 - uu____32 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + uu____15 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); Lib_IntVector_Intrinsics_vec256 v4 = - Lib_IntVector_Intrinsics_vec256_xor(uu____31, - Lib_IntVector_Intrinsics_vec256_and(uu____32, state[1U + 5U * i])); + Lib_IntVector_Intrinsics_vec256_xor(uu____14, + Lib_IntVector_Intrinsics_vec256_and(uu____15, state[1U + 5U * i])); state[0U + 5U * i] = v015; state[1U + 5U * i] = v115; state[2U + 5U * i] = v215; state[3U + 5U * i] = v315; state[4U + 5U * i] = v4;); uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; - Lib_IntVector_Intrinsics_vec256 uu____33 = state[0U]; + Lib_IntVector_Intrinsics_vec256 uu____16 = state[0U]; state[0U] = - Lib_IntVector_Intrinsics_vec256_xor(uu____33, + Lib_IntVector_Intrinsics_vec256_xor(uu____16, Lib_IntVector_Intrinsics_vec256_load64(c)); } } @@ -11468,61 +11209,49 @@ Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks( Lib_IntVector_Intrinsics_vec256 ws30 = v1__6; Lib_IntVector_Intrinsics_vec256 ws31 = v3__6; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - uint8_t *b0 = output0; - uint8_t *b1 = output1; - uint8_t *b2 = output2; - uint8_t *b3 = output3; - memcpy(b0 + i0 * 168U + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + i0 * 168U + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + i0 * 168U + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + i0 * 168U + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t));); uint8_t *b0 = output0; uint8_t *b1 = output1; uint8_t *b2 = output2; uint8_t *b3 = output3; - memcpy(b0 + i0 * 168U + 
160U, hbuf + 640U, 8U * sizeof (uint8_t)); - memcpy(b1 + i0 * 168U + 160U, hbuf + 672U, 8U * sizeof (uint8_t)); - memcpy(b2 + i0 * 168U + 160U, hbuf + 704U, 8U * sizeof (uint8_t)); - memcpy(b3 + i0 * 168U + 160U, hbuf + 736U, 8U * sizeof (uint8_t)); + memcpy(b0 + i0 * 168U, hbuf, 168U * sizeof (uint8_t)); + memcpy(b1 + i0 * 168U, hbuf + 256U, 168U * sizeof (uint8_t)); + memcpy(b2 + i0 * 168U, hbuf + 512U, 168U * sizeof (uint8_t)); + memcpy(b3 + i0 * 168U, hbuf + 768U, 168U * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; diff --git a/src/msvc/Hacl_Hash_SHA3_Scalar.c b/src/msvc/Hacl_Hash_SHA3_Scalar.c index 43d57482..7393ebf2 100644 --- a/src/msvc/Hacl_Hash_SHA3_Scalar.c +++ b/src/msvc/Hacl_Hash_SHA3_Scalar.c @@ -55,10 +55,10 @@ Hacl_Impl_SHA3_Vec_keccak_rndc[24U] = void Hacl_Hash_SHA3_Scalar_shake128( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ) { uint32_t rateInBytes = 168U; @@ -447,10 +447,10 @@ Hacl_Hash_SHA3_Scalar_shake128( void Hacl_Hash_SHA3_Scalar_shake256( - uint32_t inputByteLen, - uint8_t *input, + uint8_t *output, uint32_t outputByteLen, - uint8_t *output + uint8_t *input, + uint32_t inputByteLen ) { uint32_t rateInBytes = 136U; @@ -837,7 +837,7 @@ Hacl_Hash_SHA3_Scalar_shake256( memcpy(output + outputByteLen - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_224(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_224(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 144U; uint64_t s[25U] = { 0U }; @@ -1223,7 +1223,7 @@ void Hacl_Hash_SHA3_Scalar_sha3_224(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 28U - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_256(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_256(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 136U; uint64_t s[25U] = { 0U }; @@ -1609,7 +1609,7 @@ void Hacl_Hash_SHA3_Scalar_sha3_256(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 32U - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_384(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 104U; uint64_t s[25U] = { 0U }; @@ -1995,7 +1995,7 @@ void Hacl_Hash_SHA3_Scalar_sha3_384(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 48U - remOut, hbuf, remOut * sizeof (uint8_t)); } -void Hacl_Hash_SHA3_Scalar_sha3_512(uint32_t inputByteLen, uint8_t *input, uint8_t *output) +void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen) { uint32_t rateInBytes = 72U; uint64_t s[25U] = { 0U }; @@ -2381,3 +2381,743 @@ void Hacl_Hash_SHA3_Scalar_sha3_512(uint32_t inputByteLen, uint8_t *input, uint8 memcpy(output + 64U - remOut, hbuf, remOut * sizeof (uint8_t)); } +uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void) +{ + uint64_t *buf = (uint64_t *)KRML_HOST_CALLOC(25U, sizeof (uint64_t)); + return buf; +} + +void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s) +{ + KRML_HOST_FREE(s); +} + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) + { + 
uint8_t b1[256U] = { 0U }; + uint8_t *b_ = b1; + uint8_t *b0 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b0 + i0 * 168U, 168U * sizeof (uint8_t)); + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u = load64_le(b); + ws[0U] = u; + uint64_t u0 = load64_le(b + 8U); + ws[1U] = u0; + uint64_t u1 = load64_le(b + 16U); + ws[2U] = u1; + uint64_t u2 = load64_le(b + 24U); + ws[3U] = u2; + uint64_t u3 = load64_le(b + 32U); + ws[4U] = u3; + uint64_t u4 = load64_le(b + 40U); + ws[5U] = u4; + uint64_t u5 = load64_le(b + 48U); + ws[6U] = u5; + uint64_t u6 = load64_le(b + 56U); + ws[7U] = u6; + uint64_t u7 = load64_le(b + 64U); + ws[8U] = u7; + uint64_t u8 = load64_le(b + 72U); + ws[9U] = u8; + uint64_t u9 = load64_le(b + 80U); + ws[10U] = u9; + uint64_t u10 = load64_le(b + 88U); + ws[11U] = u10; + uint64_t u11 = load64_le(b + 96U); + ws[12U] = u11; + uint64_t u12 = load64_le(b + 104U); + ws[13U] = u12; + uint64_t u13 = load64_le(b + 112U); + ws[14U] = u13; + uint64_t u14 = load64_le(b + 120U); + ws[15U] = u14; + uint64_t u15 = load64_le(b + 128U); + ws[16U] = u15; + uint64_t u16 = load64_le(b + 136U); + ws[17U] = u16; + uint64_t u17 = load64_le(b + 144U); + ws[18U] = u17; + uint64_t u18 = load64_le(b + 152U); + ws[19U] = u18; + uint64_t u19 = load64_le(b + 160U); + ws[20U] = u19; + uint64_t u20 = load64_le(b + 168U); + ws[21U] = u20; + uint64_t u21 = load64_le(b + 176U); + ws[22U] = u21; + uint64_t u22 = load64_le(b + 184U); + ws[23U] = u22; + uint64_t u23 = load64_le(b + 192U); + ws[24U] = u23; + uint64_t u24 = load64_le(b + 200U); + ws[25U] = u24; + uint64_t u25 = load64_le(b + 208U); + ws[26U] = u25; + uint64_t u26 = load64_le(b + 216U); + ws[27U] = u26; + uint64_t u27 = load64_le(b + 224U); + ws[28U] = u27; + uint64_t u28 = load64_le(b + 232U); + ws[29U] = u28; + uint64_t u29 = load64_le(b + 240U); + ws[30U] = u29; + uint64_t u30 = load64_le(b + 248U); + ws[31U] = u30; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = + state[i + + 0U] + ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i2 + 1U) % 5U]; + uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + state[0U] = state[0U] ^ c; + } + } +} + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb_last( + uint64_t *state, + uint8_t *input, + uint32_t inputByteLen +) +{ + 
uint32_t rem = inputByteLen % 168U; + uint8_t b2[256U] = { 0U }; + uint8_t *b_ = b2; + uint32_t rem1 = inputByteLen % 168U; + uint8_t *b00 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b00 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b01 = b_; + b01[rem] = 0x1FU; + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u0 = load64_le(b); + ws[0U] = u0; + uint64_t u1 = load64_le(b + 8U); + ws[1U] = u1; + uint64_t u2 = load64_le(b + 16U); + ws[2U] = u2; + uint64_t u3 = load64_le(b + 24U); + ws[3U] = u3; + uint64_t u4 = load64_le(b + 32U); + ws[4U] = u4; + uint64_t u5 = load64_le(b + 40U); + ws[5U] = u5; + uint64_t u6 = load64_le(b + 48U); + ws[6U] = u6; + uint64_t u7 = load64_le(b + 56U); + ws[7U] = u7; + uint64_t u8 = load64_le(b + 64U); + ws[8U] = u8; + uint64_t u9 = load64_le(b + 72U); + ws[9U] = u9; + uint64_t u10 = load64_le(b + 80U); + ws[10U] = u10; + uint64_t u11 = load64_le(b + 88U); + ws[11U] = u11; + uint64_t u12 = load64_le(b + 96U); + ws[12U] = u12; + uint64_t u13 = load64_le(b + 104U); + ws[13U] = u13; + uint64_t u14 = load64_le(b + 112U); + ws[14U] = u14; + uint64_t u15 = load64_le(b + 120U); + ws[15U] = u15; + uint64_t u16 = load64_le(b + 128U); + ws[16U] = u16; + uint64_t u17 = load64_le(b + 136U); + ws[17U] = u17; + uint64_t u18 = load64_le(b + 144U); + ws[18U] = u18; + uint64_t u19 = load64_le(b + 152U); + ws[19U] = u19; + uint64_t u20 = load64_le(b + 160U); + ws[20U] = u20; + uint64_t u21 = load64_le(b + 168U); + ws[21U] = u21; + uint64_t u22 = load64_le(b + 176U); + ws[22U] = u22; + uint64_t u23 = load64_le(b + 184U); + ws[23U] = u23; + uint64_t u24 = load64_le(b + 192U); + ws[24U] = u24; + uint64_t u25 = load64_le(b + 200U); + ws[25U] = u25; + uint64_t u26 = load64_le(b + 208U); + ws[26U] = u26; + uint64_t u27 = load64_le(b + 216U); + ws[27U] = u27; + uint64_t u28 = load64_le(b + 224U); + ws[28U] = u28; + uint64_t u29 = load64_le(b + 232U); + ws[29U] = u29; + uint64_t u30 = load64_le(b + 240U); + ws[30U] = u30; + uint64_t u31 = load64_le(b + 248U); + ws[31U] = u31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + uint8_t b3[256U] = { 0U }; + uint8_t *b4 = b3; + uint8_t *b0 = b4; + b0[167U] = 0x80U; + uint64_t ws0[32U] = { 0U }; + uint8_t *b1 = b4; + uint64_t u = load64_le(b1); + ws0[0U] = u; + uint64_t u32 = load64_le(b1 + 8U); + ws0[1U] = u32; + uint64_t u33 = load64_le(b1 + 16U); + ws0[2U] = u33; + uint64_t u34 = load64_le(b1 + 24U); + ws0[3U] = u34; + uint64_t u35 = load64_le(b1 + 32U); + ws0[4U] = u35; + uint64_t u36 = load64_le(b1 + 40U); + ws0[5U] = u36; + uint64_t u37 = load64_le(b1 + 48U); + ws0[6U] = u37; + uint64_t u38 = load64_le(b1 + 56U); + ws0[7U] = u38; + uint64_t u39 = load64_le(b1 + 64U); + ws0[8U] = u39; + uint64_t u40 = load64_le(b1 + 72U); + ws0[9U] = u40; + uint64_t u41 = load64_le(b1 + 80U); + ws0[10U] = u41; + uint64_t u42 = load64_le(b1 + 88U); + ws0[11U] = u42; + uint64_t u43 = load64_le(b1 + 96U); + ws0[12U] = u43; + uint64_t u44 = load64_le(b1 + 104U); + ws0[13U] = u44; + uint64_t u45 = load64_le(b1 + 112U); + ws0[14U] = u45; + uint64_t u46 = load64_le(b1 + 120U); + ws0[15U] = u46; + uint64_t u47 = load64_le(b1 + 128U); + ws0[16U] = u47; + uint64_t u48 = load64_le(b1 + 136U); + ws0[17U] = u48; + uint64_t u49 = load64_le(b1 + 144U); + ws0[18U] = u49; + uint64_t u50 = load64_le(b1 + 152U); + ws0[19U] = u50; + uint64_t u51 = load64_le(b1 + 160U); + ws0[20U] = u51; + uint64_t u52 = load64_le(b1 + 168U); + ws0[21U] = u52; + uint64_t u53 = load64_le(b1 + 176U); + ws0[22U] = u53; + uint64_t u54 = 
load64_le(b1 + 184U); + ws0[23U] = u54; + uint64_t u55 = load64_le(b1 + 192U); + ws0[24U] = u55; + uint64_t u56 = load64_le(b1 + 200U); + ws0[25U] = u56; + uint64_t u57 = load64_le(b1 + 208U); + ws0[26U] = u57; + uint64_t u58 = load64_le(b1 + 216U); + ws0[27U] = u58; + uint64_t u59 = load64_le(b1 + 224U); + ws0[28U] = u59; + uint64_t u60 = load64_le(b1 + 232U); + ws0[29U] = u60; + uint64_t u61 = load64_le(b1 + 240U); + ws0[30U] = u61; + uint64_t u62 = load64_le(b1 + 248U); + ws0[31U] = u62; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws0[i]; + } + for (uint32_t i0 = 0U; i0 < 24U; i0++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = state[i + 0U] ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i1, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i1 + 1U) % 5U]; + uint64_t _D = _C[(i1 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i1 + 5U * i] = state[i1 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; + state[0U] = state[0U] ^ c; + } +} + +void +Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen) +{ + for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) + { + uint8_t b1[256U] = { 0U }; + uint8_t *b_ = b1; + uint8_t *b0 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b0 + i0 * 168U, 168U * sizeof (uint8_t)); + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u = load64_le(b); + ws[0U] = u; + uint64_t u0 = load64_le(b + 8U); + ws[1U] = u0; + uint64_t u1 = load64_le(b + 16U); + ws[2U] = u1; + uint64_t u2 = load64_le(b + 24U); + ws[3U] = u2; + uint64_t u3 = load64_le(b + 32U); + ws[4U] = u3; + uint64_t u4 = load64_le(b + 40U); + ws[5U] = u4; + uint64_t u5 = load64_le(b + 48U); + ws[6U] = u5; + uint64_t u6 = load64_le(b + 56U); + ws[7U] = u6; + uint64_t u7 = load64_le(b + 64U); + ws[8U] = u7; + uint64_t u8 = load64_le(b + 72U); + ws[9U] = u8; + uint64_t u9 = load64_le(b + 80U); + ws[10U] = u9; + uint64_t u10 = load64_le(b + 88U); + ws[11U] = u10; + uint64_t u11 = load64_le(b + 96U); + ws[12U] = u11; + uint64_t u12 = load64_le(b + 104U); + ws[13U] = u12; + uint64_t u13 = load64_le(b + 112U); + ws[14U] = u13; + uint64_t u14 = load64_le(b + 120U); + ws[15U] = u14; + uint64_t u15 = load64_le(b + 128U); + ws[16U] = u15; + uint64_t u16 = load64_le(b + 136U); + ws[17U] = u16; + uint64_t u17 = load64_le(b + 144U); + ws[18U] = u17; + uint64_t u18 = load64_le(b + 152U); + ws[19U] = u18; + uint64_t u19 = load64_le(b + 160U); + ws[20U] = u19; + uint64_t u20 = load64_le(b + 168U); + ws[21U] = u20; + uint64_t u21 = 
load64_le(b + 176U); + ws[22U] = u21; + uint64_t u22 = load64_le(b + 184U); + ws[23U] = u22; + uint64_t u23 = load64_le(b + 192U); + ws[24U] = u23; + uint64_t u24 = load64_le(b + 200U); + ws[25U] = u24; + uint64_t u25 = load64_le(b + 208U); + ws[26U] = u25; + uint64_t u26 = load64_le(b + 216U); + ws[27U] = u26; + uint64_t u27 = load64_le(b + 224U); + ws[28U] = u27; + uint64_t u28 = load64_le(b + 232U); + ws[29U] = u28; + uint64_t u29 = load64_le(b + 240U); + ws[30U] = u29; + uint64_t u30 = load64_le(b + 248U); + ws[31U] = u30; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = + state[i + + 0U] + ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i2 + 1U) % 5U]; + uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + state[0U] = state[0U] ^ c; + } + } + uint32_t rem = inputByteLen % 168U; + uint8_t b2[256U] = { 0U }; + uint8_t *b_ = b2; + uint32_t rem1 = inputByteLen % 168U; + uint8_t *b00 = input; + uint8_t *bl0 = b_; + memcpy(bl0, b00 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b01 = b_; + b01[rem] = 0x1FU; + uint64_t ws[32U] = { 0U }; + uint8_t *b = b_; + uint64_t u0 = load64_le(b); + ws[0U] = u0; + uint64_t u1 = load64_le(b + 8U); + ws[1U] = u1; + uint64_t u2 = load64_le(b + 16U); + ws[2U] = u2; + uint64_t u3 = load64_le(b + 24U); + ws[3U] = u3; + uint64_t u4 = load64_le(b + 32U); + ws[4U] = u4; + uint64_t u5 = load64_le(b + 40U); + ws[5U] = u5; + uint64_t u6 = load64_le(b + 48U); + ws[6U] = u6; + uint64_t u7 = load64_le(b + 56U); + ws[7U] = u7; + uint64_t u8 = load64_le(b + 64U); + ws[8U] = u8; + uint64_t u9 = load64_le(b + 72U); + ws[9U] = u9; + uint64_t u10 = load64_le(b + 80U); + ws[10U] = u10; + uint64_t u11 = load64_le(b + 88U); + ws[11U] = u11; + uint64_t u12 = load64_le(b + 96U); + ws[12U] = u12; + uint64_t u13 = load64_le(b + 104U); + ws[13U] = u13; + uint64_t u14 = load64_le(b + 112U); + ws[14U] = u14; + uint64_t u15 = load64_le(b + 120U); + ws[15U] = u15; + uint64_t u16 = load64_le(b + 128U); + ws[16U] = u16; + uint64_t u17 = load64_le(b + 136U); + ws[17U] = u17; + uint64_t u18 = load64_le(b + 144U); + ws[18U] = u18; + uint64_t u19 = load64_le(b + 152U); + ws[19U] = u19; + uint64_t u20 = load64_le(b + 160U); + ws[20U] = u20; + uint64_t u21 = load64_le(b + 168U); + ws[21U] = u21; + uint64_t 
u22 = load64_le(b + 176U); + ws[22U] = u22; + uint64_t u23 = load64_le(b + 184U); + ws[23U] = u23; + uint64_t u24 = load64_le(b + 192U); + ws[24U] = u24; + uint64_t u25 = load64_le(b + 200U); + ws[25U] = u25; + uint64_t u26 = load64_le(b + 208U); + ws[26U] = u26; + uint64_t u27 = load64_le(b + 216U); + ws[27U] = u27; + uint64_t u28 = load64_le(b + 224U); + ws[28U] = u28; + uint64_t u29 = load64_le(b + 232U); + ws[29U] = u29; + uint64_t u30 = load64_le(b + 240U); + ws[30U] = u30; + uint64_t u31 = load64_le(b + 248U); + ws[31U] = u31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws[i]; + } + uint8_t b3[256U] = { 0U }; + uint8_t *b4 = b3; + uint8_t *b0 = b4; + b0[167U] = 0x80U; + uint64_t ws0[32U] = { 0U }; + uint8_t *b1 = b4; + uint64_t u = load64_le(b1); + ws0[0U] = u; + uint64_t u32 = load64_le(b1 + 8U); + ws0[1U] = u32; + uint64_t u33 = load64_le(b1 + 16U); + ws0[2U] = u33; + uint64_t u34 = load64_le(b1 + 24U); + ws0[3U] = u34; + uint64_t u35 = load64_le(b1 + 32U); + ws0[4U] = u35; + uint64_t u36 = load64_le(b1 + 40U); + ws0[5U] = u36; + uint64_t u37 = load64_le(b1 + 48U); + ws0[6U] = u37; + uint64_t u38 = load64_le(b1 + 56U); + ws0[7U] = u38; + uint64_t u39 = load64_le(b1 + 64U); + ws0[8U] = u39; + uint64_t u40 = load64_le(b1 + 72U); + ws0[9U] = u40; + uint64_t u41 = load64_le(b1 + 80U); + ws0[10U] = u41; + uint64_t u42 = load64_le(b1 + 88U); + ws0[11U] = u42; + uint64_t u43 = load64_le(b1 + 96U); + ws0[12U] = u43; + uint64_t u44 = load64_le(b1 + 104U); + ws0[13U] = u44; + uint64_t u45 = load64_le(b1 + 112U); + ws0[14U] = u45; + uint64_t u46 = load64_le(b1 + 120U); + ws0[15U] = u46; + uint64_t u47 = load64_le(b1 + 128U); + ws0[16U] = u47; + uint64_t u48 = load64_le(b1 + 136U); + ws0[17U] = u48; + uint64_t u49 = load64_le(b1 + 144U); + ws0[18U] = u49; + uint64_t u50 = load64_le(b1 + 152U); + ws0[19U] = u50; + uint64_t u51 = load64_le(b1 + 160U); + ws0[20U] = u51; + uint64_t u52 = load64_le(b1 + 168U); + ws0[21U] = u52; + uint64_t u53 = load64_le(b1 + 176U); + ws0[22U] = u53; + uint64_t u54 = load64_le(b1 + 184U); + ws0[23U] = u54; + uint64_t u55 = load64_le(b1 + 192U); + ws0[24U] = u55; + uint64_t u56 = load64_le(b1 + 200U); + ws0[25U] = u56; + uint64_t u57 = load64_le(b1 + 208U); + ws0[26U] = u57; + uint64_t u58 = load64_le(b1 + 216U); + ws0[27U] = u58; + uint64_t u59 = load64_le(b1 + 224U); + ws0[28U] = u59; + uint64_t u60 = load64_le(b1 + 232U); + ws0[29U] = u60; + uint64_t u61 = load64_le(b1 + 240U); + ws0[30U] = u61; + uint64_t u62 = load64_le(b1 + 248U); + ws0[31U] = u62; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = state[i] ^ ws0[i]; + } + for (uint32_t i0 = 0U; i0 < 24U; i0++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = state[i + 0U] ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i1, + 0U, + 5U, + 1U, + uint64_t uu____2 = _C[(i1 + 1U) % 5U]; + uint64_t _D = _C[(i1 + 4U) % 5U] ^ (uu____2 << 1U | uu____2 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i1 + 5U * i] = state[i1 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____3 = current; + state[_Y] = uu____3 << r | uu____3 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ 
(~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; + state[0U] = state[0U] ^ c; + } +} + +void +Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( + uint64_t *state, + uint8_t *output, + uint32_t outputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < outputByteLen / 168U; i0++) + { + uint8_t hbuf[256U] = { 0U }; + uint64_t ws[32U] = { 0U }; + memcpy(ws, state, 25U * sizeof (uint64_t)); + for (uint32_t i = 0U; i < 32U; i++) + { + store64_le(hbuf + i * 8U, ws[i]); + } + memcpy(output + i0 * 168U, hbuf, 168U * sizeof (uint8_t)); + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + uint64_t _C[5U] = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + _C[i] = + state[i + + 0U] + ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + uint64_t uu____0 = _C[(i2 + 1U) % 5U]; + uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); + KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); + uint64_t x = state[1U]; + uint64_t current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + uint64_t temp = state[_Y]; + uint64_t uu____1 = current; + state[_Y] = uu____1 << r | uu____1 >> (64U - r); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); + uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); + uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); + uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); + uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); + state[0U + 5U * i] = v0; + state[1U + 5U * i] = v1; + state[2U + 5U * i] = v2; + state[3U + 5U * i] = v3; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + state[0U] = state[0U] ^ c; + } + } +} + diff --git a/src/msvc/Hacl_Hash_SHA3_Simd256.c b/src/msvc/Hacl_Hash_SHA3_Simd256.c index b9bfcee5..9748a375 100644 --- a/src/msvc/Hacl_Hash_SHA3_Simd256.c +++ b/src/msvc/Hacl_Hash_SHA3_Simd256.c @@ -26,20 +26,19 @@ #include "Hacl_Hash_SHA3_Simd256.h" #include "internal/Hacl_Hash_SHA3_Scalar.h" -#include "libintvector.h" void Hacl_Hash_SHA3_Simd256_shake128( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -438,63 +437,63 @@ Hacl_Hash_SHA3_Simd256_shake128( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = 
ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x1FU; + b12[rem] = 0x1FU; + b22[rem] = 0x1FU; + b32[rem] = 0x1FU; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x1FU; - b13[rem] = 0x1FU; - b23[rem] = 0x1FU; - b33[rem] = 0x1FU; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -723,57 +722,57 @@ Hacl_Hash_SHA3_Simd256_shake128( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; - uint8_t *b36 = b.snd.snd.snd; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - 
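For reference, the 0x1F and 0x80 writes in these hunks are Keccak's pad10*1 padding with SHAKE domain separation: the suffix byte 0x1F (the four domain bits plus the first padding bit) goes at offset inputByteLen % rateInBytes of a zeroed final block, and the closing padding bit 0x80 goes at the last byte of the rate; the fixed-output SHA-3 functions further down use 0x06 instead. Both blocks are XORed into the state, so the two bytes combine correctly even when rem == rateInBytes - 1. A minimal scalar sketch of the same step, under a hypothetical helper name:

/* Hypothetical illustration only; not part of this patch. */
static inline void
shake_pad_last_block(uint8_t *block, uint32_t rateInBytes, uint32_t rem)
{
  /* block is rateInBytes bytes, zero-initialized; rem = inputByteLen % rateInBytes. */
  block[rem] = 0x1FU;               /* SHAKE suffix bits + first pad bit */
  block[rateInBytes - 1U] ^= 0x80U; /* last pad bit; XOR also covers rem == rateInBytes - 1U */
}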
KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + 
ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -1295,62 +1294,49 @@ Hacl_Hash_SHA3_Simd256_shake128( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + uint8_t *b36 = 
rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -1645,76 +1631,63 @@ Hacl_Hash_SHA3_Simd256_shake128( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b36 = rb.snd.snd.snd; uint8_t *b2 = rb.snd.snd.fst; uint8_t *b1 = rb.snd.fst; uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + memcpy(b0 + outputByteLen - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + outputByteLen - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + outputByteLen - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + outputByteLen - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void Hacl_Hash_SHA3_Simd256_shake256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - 
uint32_t outputByteLen, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -2113,63 +2086,63 @@ Hacl_Hash_SHA3_Simd256_shake256( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x1FU; + b12[rem] = 0x1FU; + b22[rem] = 0x1FU; + b32[rem] = 0x1FU; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x1FU; - b13[rem] = 0x1FU; - b23[rem] = 0x1FU; - b33[rem] = 0x1FU; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -2398,57 +2371,57 @@ Hacl_Hash_SHA3_Simd256_shake256( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; - uint8_t *b36 = b.snd.snd.snd; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t 
*b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + 
ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -2970,62 +2943,49 @@ Hacl_Hash_SHA3_Simd256_shake256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = rateInBytes % 32U; 
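The ws index remap above (ws[1U] = ws4, ws[2U] = ws8, and so on) finishes un-interleaving the four SIMD lanes, so after the store64_le loop hbuf holds each state's rate block contiguously: the bytes destined for output k start at hbuf + k * 256U. That is what lets the old per-32-byte scatter loop, with its rem0/j tail handling, collapse into one memcpy per output in the added lines. Schematically, with rb, hbuf, i0 and rateInBytes as in the surrounding code:

/* Sketch of the simplified copy-out; rb.fst .. rb.snd.snd.snd are the
   four destination buffers and i0 indexes the current rate-sized block. */
memcpy(rb.fst         + i0 * rateInBytes, hbuf,        rateInBytes * sizeof (uint8_t));
memcpy(rb.snd.fst     + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t));
memcpy(rb.snd.snd.fst + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t));
memcpy(rb.snd.snd.snd + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t));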
- uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -3320,75 +3280,62 @@ Hacl_Hash_SHA3_Simd256_shake256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b36 = rb.snd.snd.snd; uint8_t *b2 = rb.snd.snd.fst; uint8_t *b1 = rb.snd.fst; uint8_t *b0 = rb.fst; - memcpy(b0 + outputByteLen - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + outputByteLen - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + memcpy(b0 + 
outputByteLen - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + outputByteLen - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + outputByteLen - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + outputByteLen - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void Hacl_Hash_SHA3_Simd256_sha3_224( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -3787,63 +3734,63 @@ Hacl_Hash_SHA3_Simd256_sha3_224( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -4072,57 +4019,57 @@ Hacl_Hash_SHA3_Simd256_sha3_224( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t 
b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; - uint8_t *b36 = b.snd.snd.snd; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -4644,62 +4591,49 @@ Hacl_Hash_SHA3_Simd256_sha3_224( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - 
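Each for (uint32_t i1 = 0U; i1 < 24U; i1++) block repeated through this file is one unrolled round of Keccak-f[1600] over the 5x5 lane matrix (lane index x + 5*y): the _C/_D computation is theta, the piln/rotc walk is rho and pi combined, the v0..v4 block is chi, and the closing XOR with keccak_rndc[i1] is iota. A compact scalar sketch of one round, using the same tables under hypothetical short names:

/* Compact sketch of one Keccak-f[1600] round; st is uint64_t[25];
   piln, rotc and rndc stand for the Hacl_Impl_SHA3_Vec tables above,
   and i1 is the round index. Illustration only. */
#define ROTL64(v, n) (((v) << (n)) | ((v) >> (64U - (n))))
uint64_t C[5U], D, t, cur;
for (uint32_t x = 0U; x < 5U; x++)                                   /* theta */
  C[x] = st[x] ^ st[x + 5U] ^ st[x + 10U] ^ st[x + 15U] ^ st[x + 20U];
for (uint32_t x = 0U; x < 5U; x++)
{
  D = C[(x + 4U) % 5U] ^ ROTL64(C[(x + 1U) % 5U], 1U);
  for (uint32_t y = 0U; y < 5U; y++) st[x + 5U * y] ^= D;
}
cur = st[1U];
for (uint32_t i = 0U; i < 24U; i++)                                  /* rho + pi */
{
  t = st[piln[i]]; st[piln[i]] = ROTL64(cur, rotc[i]); cur = t;
}
for (uint32_t y = 0U; y < 5U; y++)                                   /* chi */
{
  uint64_t row[5U];
  for (uint32_t x = 0U; x < 5U; x++) row[x] = st[x + 5U * y];
  for (uint32_t x = 0U; x < 5U; x++)
    st[x + 5U * y] = row[x] ^ (~row[(x + 1U) % 5U] & row[(x + 2U) % 5U]);
}
st[0U] ^= rndc[i1];                                                  /* iota */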
uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -4994,75 +4928,62 @@ Hacl_Hash_SHA3_Simd256_sha3_224( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 28U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 28U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 28U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 28U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b36 = rb.snd.snd.snd; 
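The 28U - remOut offsets in this hunk are sha3_224's fixed 28-byte digest length. After this patch, every Simd256 one-shot entry point takes its four digest buffers first, then (for the XOFs) the output length, then the four equal-length inputs, and the shared input byte length last. A hypothetical call with caller-owned buffers:

/* Hypothetical usage; in0..in3 must all be inputByteLen bytes long. */
uint8_t d0[28U], d1[28U], d2[28U], d3[28U];
Hacl_Hash_SHA3_Simd256_sha3_224(d0, d1, d2, d3, in0, in1, in2, in3, inputByteLen);

/* The XOF variants take outputByteLen right after the digest pointers: */
uint8_t o0[64U], o1[64U], o2[64U], o3[64U];
Hacl_Hash_SHA3_Simd256_shake128(o0, o1, o2, o3, 64U, in0, in1, in2, in3, inputByteLen);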
uint8_t *b2 = rb.snd.snd.fst; uint8_t *b1 = rb.snd.fst; uint8_t *b0 = rb.fst; - memcpy(b0 + 28U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 28U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 28U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 28U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + memcpy(b0 + 28U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 28U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 28U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 28U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void Hacl_Hash_SHA3_Simd256_sha3_256( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -5461,63 +5382,63 @@ Hacl_Hash_SHA3_Simd256_sha3_256( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -5746,57 +5667,57 @@ Hacl_Hash_SHA3_Simd256_sha3_256( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; - uint8_t *b36 = b.snd.snd.snd; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] 
= Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -6318,62 +6239,49 @@ Hacl_Hash_SHA3_Simd256_sha3_256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + 
ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -6668,75 +6576,62 @@ Hacl_Hash_SHA3_Simd256_sha3_256( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 32U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof 
(uint8_t)); - memcpy(b1 + 32U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 32U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 32U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b36 = rb.snd.snd.snd; uint8_t *b2 = rb.snd.snd.fst; uint8_t *b1 = rb.snd.fst; uint8_t *b0 = rb.fst; - memcpy(b0 + 32U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 32U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 32U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 32U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + memcpy(b0 + 32U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 32U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 32U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 32U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void Hacl_Hash_SHA3_Simd256_sha3_384( - uint32_t inputByteLen, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, uint8_t *input0, uint8_t *input1, uint8_t *input2, uint8_t *input3, - uint8_t *output0, - uint8_t *output1, - uint8_t *output2, - uint8_t *output3 + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -7135,63 +7030,63 @@ Hacl_Hash_SHA3_Simd256_sha3_384( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -7420,57 +7315,57 @@ Hacl_Hash_SHA3_Simd256_sha3_384( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; - uint8_t *b36 = b.snd.snd.snd; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] 
= Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -7992,62 +7887,49 @@ Hacl_Hash_SHA3_Simd256_sha3_384( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - 
ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -8342,75 +8224,62 @@ Hacl_Hash_SHA3_Simd256_sha3_384( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + 
ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 48U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 48U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 48U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 48U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b36 = rb.snd.snd.snd; uint8_t *b2 = rb.snd.snd.fst; uint8_t *b1 = rb.snd.fst; uint8_t *b0 = rb.fst; - memcpy(b0 + 48U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 48U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 48U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 48U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + memcpy(b0 + 48U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 48U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 48U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 48U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); } void Hacl_Hash_SHA3_Simd256_sha3_512( - uint32_t inputByteLen, - uint8_t *input0, - uint8_t *input1, - uint8_t *input2, - uint8_t *input3, uint8_t *output0, uint8_t *output1, uint8_t *output2, - uint8_t *output3 + uint8_t *output3, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen ) { K____uint8_t___uint8_t____K____uint8_t___uint8_t_ @@ -8809,63 +8678,63 @@ Hacl_Hash_SHA3_Simd256_sha3_512( K____uint8_t___uint8_t____K____uint8_t___uint8_t_ b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; uint32_t rem1 = inputByteLen % rateInBytes; - uint8_t *b32 = ib.snd.snd.snd; - uint8_t *b22 = ib.snd.snd.fst; - uint8_t *b12 = ib.snd.fst; - uint8_t *b02 = ib.fst; + uint8_t *b31 = ib.snd.snd.snd; + uint8_t *b21 = ib.snd.snd.fst; + uint8_t *b11 = ib.snd.fst; + uint8_t *b01 = ib.fst; uint8_t *bl3 = b_.snd.snd.snd; uint8_t *bl2 = b_.snd.snd.fst; uint8_t *bl1 = b_.snd.fst; uint8_t *bl0 = b_.fst; - memcpy(bl0, b02 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl1, b12 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl2, b22 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - memcpy(bl3, b32 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x06U; + b12[rem] = 0x06U; + b22[rem] = 0x06U; + b32[rem] = 0x06U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] KRML_POST_ALIGN(32) = { 0U }; uint8_t *b33 = b_.snd.snd.snd; uint8_t *b23 = b_.snd.snd.fst; uint8_t *b13 = b_.snd.fst; uint8_t *b03 = b_.fst; - b03[rem] = 0x06U; - b13[rem] = 0x06U; - b23[rem] = 0x06U; - b33[rem] = 0x06U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws32[32U] 
KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b34 = b_.snd.snd.snd; - uint8_t *b24 = b_.snd.snd.fst; - uint8_t *b14 = b_.snd.fst; - uint8_t *b04 = b_.fst; - ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04); - ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14); - ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24); - ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34); - ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 32U); - ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 32U); - ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 32U); - ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 32U); - ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 64U); - ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 64U); - ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 64U); - ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 64U); - ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 96U); - ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 96U); - ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 96U); - ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 96U); - ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 128U); - ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 128U); - ws32[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 128U); - ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 128U); - ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 160U); - ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 160U); - ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 160U); - ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 160U); - ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 192U); - ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 192U); - ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 192U); - ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 192U); - ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b04 + 224U); - ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b14 + 224U); - ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b24 + 224U); - ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b34 + 224U); + ws32[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws32[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws32[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws32[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws32[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws32[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws32[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws32[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws32[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws32[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws32[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws32[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws32[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws32[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws32[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws32[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws32[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws32[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws32[18U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws32[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws32[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws32[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws32[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws32[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws32[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws32[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws32[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws32[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws32[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws32[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws32[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws32[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); Lib_IntVector_Intrinsics_vec256 v00 = ws32[0U]; Lib_IntVector_Intrinsics_vec256 v10 = ws32[1U]; Lib_IntVector_Intrinsics_vec256 v20 = ws32[2U]; @@ -9094,57 +8963,57 @@ Hacl_Hash_SHA3_Simd256_sha3_512( { s[i] = Lib_IntVector_Intrinsics_vec256_xor(s[i], ws32[i]); } - uint8_t b05[256U] = { 0U }; - uint8_t b15[256U] = { 0U }; - uint8_t b25[256U] = { 0U }; - uint8_t b35[256U] = { 0U }; + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; K____uint8_t___uint8_t____K____uint8_t___uint8_t_ - b = { .fst = b05, .snd = { .fst = b15, .snd = { .fst = b25, .snd = b35 } } }; - uint8_t *b36 = b.snd.snd.snd; + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[rateInBytes - 1U] = 0x80U; + b15[rateInBytes - 1U] = 0x80U; + b25[rateInBytes - 1U] = 0x80U; + b35[rateInBytes - 1U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; uint8_t *b26 = b.snd.snd.fst; uint8_t *b16 = b.snd.fst; uint8_t *b06 = b.fst; - b06[rateInBytes - 1U] = 0x80U; - b16[rateInBytes - 1U] = 0x80U; - b26[rateInBytes - 1U] = 0x80U; - b36[rateInBytes - 1U] = 0x80U; - KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws34[32U] KRML_POST_ALIGN(32) = { 0U }; - uint8_t *b37 = b.snd.snd.snd; - uint8_t *b27 = b.snd.snd.fst; - uint8_t *b17 = b.snd.fst; - uint8_t *b07 = b.fst; - ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07); - ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17); - ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27); - ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37); - ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 32U); - ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 32U); - ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 32U); - ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 32U); - ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 64U); - ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 64U); - ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 64U); - ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 64U); - ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 96U); - ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 96U); - ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 96U); - ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 96U); - ws34[16U] 
= Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 128U); - ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 128U); - ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 128U); - ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 128U); - ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 160U); - ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 160U); - ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 160U); - ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 160U); - ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 192U); - ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 192U); - ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 192U); - ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 192U); - ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b07 + 224U); - ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b17 + 224U); - ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b27 + 224U); - ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b37 + 224U); + ws34[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06); + ws34[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16); + ws34[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26); + ws34[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws34[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 32U); + ws34[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 32U); + ws34[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 32U); + ws34[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws34[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 64U); + ws34[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 64U); + ws34[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 64U); + ws34[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws34[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 96U); + ws34[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 96U); + ws34[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 96U); + ws34[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws34[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 128U); + ws34[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 128U); + ws34[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 128U); + ws34[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws34[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 160U); + ws34[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 160U); + ws34[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 160U); + ws34[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws34[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 192U); + ws34[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 192U); + ws34[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 192U); + ws34[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws34[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b06 + 224U); + ws34[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b16 + 224U); + ws34[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b26 + 224U); + ws34[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); Lib_IntVector_Intrinsics_vec256 v08 = ws34[0U]; Lib_IntVector_Intrinsics_vec256 v18 = ws34[1U]; Lib_IntVector_Intrinsics_vec256 v28 = ws34[2U]; @@ -9666,62 +9535,49 @@ Hacl_Hash_SHA3_Simd256_sha3_512( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - 
ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < rateInBytes / 32U; i++) - { - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = rateInBytes % 32U; - uint32_t j = rateInBytes / 32U; - uint8_t *b31 = rb.snd.snd.snd; - uint8_t *b21 = rb.snd.snd.fst; - uint8_t *b11 = rb.snd.fst; - uint8_t *b01 = rb.fst; - memcpy(b01 + i0 * rateInBytes + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b11 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b21 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b31 + i0 * rateInBytes + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + uint8_t *b36 = rb.snd.snd.snd; + uint8_t *b2 = rb.snd.snd.fst; + uint8_t *b1 = rb.snd.fst; + uint8_t *b0 = rb.fst; + memcpy(b0 + i0 * rateInBytes, hbuf, rateInBytes * sizeof (uint8_t)); + memcpy(b1 + i0 * rateInBytes, hbuf + 256U, rateInBytes * sizeof (uint8_t)); + memcpy(b2 + i0 * rateInBytes, hbuf + 512U, rateInBytes * sizeof (uint8_t)); + memcpy(b36 + i0 * rateInBytes, hbuf + 768U, rateInBytes * sizeof (uint8_t)); for (uint32_t i1 = 0U; i1 < 24U; i1++) { KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; @@ -10016,61 +9872,1482 @@ Hacl_Hash_SHA3_Simd256_sha3_512( Lib_IntVector_Intrinsics_vec256 ws30 = v1__22; Lib_IntVector_Intrinsics_vec256 ws31 = v3__22; ws[0U] = ws0; - ws[1U] = ws1; - ws[2U] = ws2; - ws[3U] = ws3; - ws[4U] = ws4; - ws[5U] = ws5; - ws[6U] = ws6; - ws[7U] = ws7; - ws[8U] = ws8; - ws[9U] = ws9; - ws[10U] = ws10; - ws[11U] = ws11; - ws[12U] = ws12; - ws[13U] = ws13; - ws[14U] = ws14; - ws[15U] = ws15; - ws[16U] = ws16; - ws[17U] = ws17; - ws[18U] = ws18; - ws[19U] = ws19; - ws[20U] = ws20; - ws[21U] = ws21; - ws[22U] = ws22; - ws[23U] = ws23; - ws[24U] = ws24; - ws[25U] = ws25; - ws[26U] = ws26; - ws[27U] = ws27; - ws[28U] = ws28; - ws[29U] = ws29; - ws[30U] = ws30; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + 
ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; ws[31U] = ws31; for (uint32_t i = 0U; i < 32U; i++) { Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); } - for (uint32_t i = 0U; i < remOut / 32U; i++) - { - uint8_t *b3 = rb.snd.snd.snd; - uint8_t *b2 = rb.snd.snd.fst; - uint8_t *b1 = rb.snd.fst; - uint8_t *b0 = rb.fst; - memcpy(b0 + 64U - remOut + i * 32U, hbuf + i * 128U, 32U * sizeof (uint8_t)); - memcpy(b1 + 64U - remOut + i * 32U, hbuf + i * 128U + 32U, 32U * sizeof (uint8_t)); - memcpy(b2 + 64U - remOut + i * 32U, hbuf + i * 128U + 64U, 32U * sizeof (uint8_t)); - memcpy(b3 + 64U - remOut + i * 32U, hbuf + i * 128U + 96U, 32U * sizeof (uint8_t)); - } - uint32_t rem0 = remOut % 32U; - uint32_t j = remOut / 32U; - uint8_t *b3 = rb.snd.snd.snd; + uint8_t *b36 = rb.snd.snd.snd; uint8_t *b2 = rb.snd.snd.fst; uint8_t *b1 = rb.snd.fst; uint8_t *b0 = rb.fst; - memcpy(b0 + 64U - remOut + j * 32U, hbuf + j * 128U, rem0 * sizeof (uint8_t)); - memcpy(b1 + 64U - remOut + j * 32U, hbuf + j * 128U + 32U, rem0 * sizeof (uint8_t)); - memcpy(b2 + 64U - remOut + j * 32U, hbuf + j * 128U + 64U, rem0 * sizeof (uint8_t)); - memcpy(b3 + 64U - remOut + j * 32U, hbuf + j * 128U + 96U, rem0 * sizeof (uint8_t)); + memcpy(b0 + 64U - remOut, hbuf, remOut * sizeof (uint8_t)); + memcpy(b1 + 64U - remOut, hbuf + 256U, remOut * sizeof (uint8_t)); + memcpy(b2 + 64U - remOut, hbuf + 512U, remOut * sizeof (uint8_t)); + memcpy(b36 + 64U - remOut, hbuf + 768U, remOut * sizeof (uint8_t)); +} + +uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void) +{ + uint64_t *buf = (uint64_t *)KRML_HOST_CALLOC(100U, sizeof (uint64_t)); + return buf; +} + +void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s) +{ + KRML_HOST_FREE(s); +} + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) + { + uint8_t b00[256U] = { 0U }; + uint8_t b10[256U] = { 0U }; + uint8_t b20[256U] = { 0U }; + uint8_t b30[256U] = { 0U }; + K____uint8_t___uint8_t____K____uint8_t___uint8_t_ + b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; + uint8_t *b01 = input0; + uint8_t *b11 = input1; + uint8_t *b21 = input2; + uint8_t *b31 = input3; + uint8_t *bl3 = b_.snd.snd.snd; + uint8_t *bl2 = b_.snd.snd.fst; + uint8_t *bl1 = b_.snd.fst; + uint8_t *bl0 = b_.fst; + memcpy(bl0, b01 + i0 * 168U, 168U * sizeof (uint8_t)); + memcpy(bl1, b11 + i0 * 168U, 168U * sizeof (uint8_t)); + memcpy(bl2, b21 + i0 * 168U, 168U * sizeof (uint8_t)); + memcpy(bl3, b31 + i0 * 168U, 168U * sizeof (uint8_t)); + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b_.snd.snd.snd; + uint8_t *b2 = b_.snd.snd.fst; + uint8_t *b1 = b_.snd.fst; + uint8_t *b0 = b_.fst; + ws[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0); + ws[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1); + ws[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2); + ws[3U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 32U); + ws[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 32U); + ws[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 32U); + ws[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 64U); + ws[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 64U); + ws[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 64U); + ws[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 96U); + ws[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 96U); + ws[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 96U); + ws[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 128U); + ws[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 128U); + ws[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 128U); + ws[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 160U); + ws[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 160U); + ws[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 160U); + ws[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 192U); + ws[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 192U); + ws[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 192U); + ws[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 224U); + ws[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 224U); + ws[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 224U); + ws[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); + Lib_IntVector_Intrinsics_vec256 v00 = ws[0U]; + Lib_IntVector_Intrinsics_vec256 v10 = ws[1U]; + Lib_IntVector_Intrinsics_vec256 v20 = ws[2U]; + Lib_IntVector_Intrinsics_vec256 v30 = ws[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v1_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v2_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v3_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v0__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v1__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v2__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 + v3__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 ws0 = v0__; + Lib_IntVector_Intrinsics_vec256 ws1 = v2__; + Lib_IntVector_Intrinsics_vec256 ws2 = v1__; + Lib_IntVector_Intrinsics_vec256 ws3 = v3__; + Lib_IntVector_Intrinsics_vec256 v01 = ws[4U]; + Lib_IntVector_Intrinsics_vec256 v11 = ws[5U]; + Lib_IntVector_Intrinsics_vec256 v21 = ws[6U]; + Lib_IntVector_Intrinsics_vec256 v31 = ws[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v1_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v2_0 = 
Lib_IntVector_Intrinsics_vec256_interleave_low64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v3_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v0__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v1__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v2__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 + v3__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 ws4 = v0__0; + Lib_IntVector_Intrinsics_vec256 ws5 = v2__0; + Lib_IntVector_Intrinsics_vec256 ws6 = v1__0; + Lib_IntVector_Intrinsics_vec256 ws7 = v3__0; + Lib_IntVector_Intrinsics_vec256 v02 = ws[8U]; + Lib_IntVector_Intrinsics_vec256 v12 = ws[9U]; + Lib_IntVector_Intrinsics_vec256 v22 = ws[10U]; + Lib_IntVector_Intrinsics_vec256 v32 = ws[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v1_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v2_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v3_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v0__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v1__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v2__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 + v3__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 ws8 = v0__1; + Lib_IntVector_Intrinsics_vec256 ws9 = v2__1; + Lib_IntVector_Intrinsics_vec256 ws10 = v1__1; + Lib_IntVector_Intrinsics_vec256 ws11 = v3__1; + Lib_IntVector_Intrinsics_vec256 v03 = ws[12U]; + Lib_IntVector_Intrinsics_vec256 v13 = ws[13U]; + Lib_IntVector_Intrinsics_vec256 v23 = ws[14U]; + Lib_IntVector_Intrinsics_vec256 v33 = ws[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v1_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v2_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v3_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v0__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v1__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v2__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 + v3__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 ws12 = v0__2; + Lib_IntVector_Intrinsics_vec256 ws13 = v2__2; + Lib_IntVector_Intrinsics_vec256 ws14 = v1__2; + Lib_IntVector_Intrinsics_vec256 ws15 = v3__2; + Lib_IntVector_Intrinsics_vec256 v04 = ws[16U]; + Lib_IntVector_Intrinsics_vec256 v14 = ws[17U]; + Lib_IntVector_Intrinsics_vec256 v24 = ws[18U]; + Lib_IntVector_Intrinsics_vec256 v34 = ws[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v04, v14); + 
Lib_IntVector_Intrinsics_vec256 + v1_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v2_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v3_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v0__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v1__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v2__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 + v3__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 ws16 = v0__3; + Lib_IntVector_Intrinsics_vec256 ws17 = v2__3; + Lib_IntVector_Intrinsics_vec256 ws18 = v1__3; + Lib_IntVector_Intrinsics_vec256 ws19 = v3__3; + Lib_IntVector_Intrinsics_vec256 v05 = ws[20U]; + Lib_IntVector_Intrinsics_vec256 v15 = ws[21U]; + Lib_IntVector_Intrinsics_vec256 v25 = ws[22U]; + Lib_IntVector_Intrinsics_vec256 v35 = ws[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v1_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v2_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v3_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v0__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v1__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v2__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 + v3__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 ws20 = v0__4; + Lib_IntVector_Intrinsics_vec256 ws21 = v2__4; + Lib_IntVector_Intrinsics_vec256 ws22 = v1__4; + Lib_IntVector_Intrinsics_vec256 ws23 = v3__4; + Lib_IntVector_Intrinsics_vec256 v06 = ws[24U]; + Lib_IntVector_Intrinsics_vec256 v16 = ws[25U]; + Lib_IntVector_Intrinsics_vec256 v26 = ws[26U]; + Lib_IntVector_Intrinsics_vec256 v36 = ws[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v1_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v2_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v3_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v0__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v1__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v2__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 + v3__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 ws24 = v0__5; + Lib_IntVector_Intrinsics_vec256 ws25 = v2__5; + Lib_IntVector_Intrinsics_vec256 ws26 = v1__5; + Lib_IntVector_Intrinsics_vec256 ws27 = v3__5; + Lib_IntVector_Intrinsics_vec256 v0 = ws[28U]; + Lib_IntVector_Intrinsics_vec256 v1 = ws[29U]; + Lib_IntVector_Intrinsics_vec256 v2 = ws[30U]; + 
Lib_IntVector_Intrinsics_vec256 v3 = ws[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v1_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v2_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v3_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v0__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v1__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v2__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 + v3__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 ws28 = v0__6; + Lib_IntVector_Intrinsics_vec256 ws29 = v2__6; + Lib_IntVector_Intrinsics_vec256 ws30 = v1__6; + Lib_IntVector_Intrinsics_vec256 ws31 = v3__6; + ws[0U] = ws0; + ws[1U] = ws1; + ws[2U] = ws2; + ws[3U] = ws3; + ws[4U] = ws4; + ws[5U] = ws5; + ws[6U] = ws6; + ws[7U] = ws7; + ws[8U] = ws8; + ws[9U] = ws9; + ws[10U] = ws10; + ws[11U] = ws11; + ws[12U] = ws12; + ws[13U] = ws13; + ws[14U] = ws14; + ws[15U] = ws15; + ws[16U] = ws16; + ws[17U] = ws17; + ws[18U] = ws18; + ws[19U] = ws19; + ws[20U] = ws20; + ws[21U] = ws21; + ws[22U] = ws22; + ws[23U] = ws23; + ws[24U] = ws24; + ws[25U] = ws25; + ws[26U] = ws26; + ws[27U] = ws27; + ws[28U] = ws28; + ws[29U] = ws29; + ws[30U] = ws30; + ws[31U] = ws31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = Lib_IntVector_Intrinsics_vec256_xor(state[i], ws[i]); + } + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____0 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____1 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____2 = state[i + 10U]; + _C[i] = + Lib_IntVector_Intrinsics_vec256_xor(uu____0, + Lib_IntVector_Intrinsics_vec256_xor(uu____1, + Lib_IntVector_Intrinsics_vec256_xor(uu____2, + Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____3 = _C[(i2 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____4 = _C[(i2 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 + _D = + Lib_IntVector_Intrinsics_vec256_xor(uu____3, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____4, + 1U), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____4, 63U))); + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + state[i2 + 5U * i] = Lib_IntVector_Intrinsics_vec256_xor(state[i2 + 5U * i], _D););); + Lib_IntVector_Intrinsics_vec256 x = state[1U]; + Lib_IntVector_Intrinsics_vec256 current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; + Lib_IntVector_Intrinsics_vec256 uu____5 = current; + state[_Y] = + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____5, + r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____5, 64U - r)); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____6 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + 
uu____7 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v07 = + Lib_IntVector_Intrinsics_vec256_xor(uu____6, + Lib_IntVector_Intrinsics_vec256_and(uu____7, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____8 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____9 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v17 = + Lib_IntVector_Intrinsics_vec256_xor(uu____8, + Lib_IntVector_Intrinsics_vec256_and(uu____9, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____10 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____11 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v27 = + Lib_IntVector_Intrinsics_vec256_xor(uu____10, + Lib_IntVector_Intrinsics_vec256_and(uu____11, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____12 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____13 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v37 = + Lib_IntVector_Intrinsics_vec256_xor(uu____12, + Lib_IntVector_Intrinsics_vec256_and(uu____13, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____14 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____15 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v4 = + Lib_IntVector_Intrinsics_vec256_xor(uu____14, + Lib_IntVector_Intrinsics_vec256_and(uu____15, state[1U + 5U * i])); + state[0U + 5U * i] = v07; + state[1U + 5U * i] = v17; + state[2U + 5U * i] = v27; + state[3U + 5U * i] = v37; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + Lib_IntVector_Intrinsics_vec256 uu____16 = state[0U]; + state[0U] = + Lib_IntVector_Intrinsics_vec256_xor(uu____16, + Lib_IntVector_Intrinsics_vec256_load64(c)); + } + } +} + +void +Hacl_Hash_SHA3_Simd256_shake128_absorb_last( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *input0, + uint8_t *input1, + uint8_t *input2, + uint8_t *input3, + uint32_t inputByteLen +) +{ + uint32_t rem = inputByteLen % 168U; + uint8_t b00[256U] = { 0U }; + uint8_t b10[256U] = { 0U }; + uint8_t b20[256U] = { 0U }; + uint8_t b30[256U] = { 0U }; + K____uint8_t___uint8_t____K____uint8_t___uint8_t_ + b_ = { .fst = b00, .snd = { .fst = b10, .snd = { .fst = b20, .snd = b30 } } }; + uint32_t rem1 = inputByteLen % 168U; + uint8_t *b01 = input0; + uint8_t *b11 = input1; + uint8_t *b21 = input2; + uint8_t *b31 = input3; + uint8_t *bl3 = b_.snd.snd.snd; + uint8_t *bl2 = b_.snd.snd.fst; + uint8_t *bl1 = b_.snd.fst; + uint8_t *bl0 = b_.fst; + memcpy(bl0, b01 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl1, b11 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl2, b21 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + memcpy(bl3, b31 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); + uint8_t *b32 = b_.snd.snd.snd; + uint8_t *b22 = b_.snd.snd.fst; + uint8_t *b12 = b_.snd.fst; + uint8_t *b02 = b_.fst; + b02[rem] = 0x1FU; + b12[rem] = 0x1FU; + b22[rem] = 0x1FU; + b32[rem] = 0x1FU; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b33 = b_.snd.snd.snd; + uint8_t *b23 = b_.snd.snd.fst; + uint8_t *b13 = b_.snd.fst; + uint8_t *b03 = b_.fst; + ws[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03); + ws[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13); + ws[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23); + ws[3U] = 
Lib_IntVector_Intrinsics_vec256_load64_le(b33); + ws[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 32U); + ws[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 32U); + ws[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 32U); + ws[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 32U); + ws[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 64U); + ws[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 64U); + ws[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 64U); + ws[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 64U); + ws[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 96U); + ws[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 96U); + ws[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 96U); + ws[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 96U); + ws[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 128U); + ws[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 128U); + ws[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 128U); + ws[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 128U); + ws[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 160U); + ws[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 160U); + ws[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 160U); + ws[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 160U); + ws[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 192U); + ws[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 192U); + ws[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 192U); + ws[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 192U); + ws[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b03 + 224U); + ws[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b13 + 224U); + ws[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b23 + 224U); + ws[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b33 + 224U); + Lib_IntVector_Intrinsics_vec256 v00 = ws[0U]; + Lib_IntVector_Intrinsics_vec256 v10 = ws[1U]; + Lib_IntVector_Intrinsics_vec256 v20 = ws[2U]; + Lib_IntVector_Intrinsics_vec256 v30 = ws[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v1_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v2_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v3_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v0__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v1__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v2__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 + v3__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 ws00 = v0__; + Lib_IntVector_Intrinsics_vec256 ws110 = v2__; + Lib_IntVector_Intrinsics_vec256 ws210 = v1__; + Lib_IntVector_Intrinsics_vec256 ws32 = v3__; + Lib_IntVector_Intrinsics_vec256 v01 = ws[4U]; + Lib_IntVector_Intrinsics_vec256 v11 = ws[5U]; + Lib_IntVector_Intrinsics_vec256 v21 = ws[6U]; + Lib_IntVector_Intrinsics_vec256 v31 = ws[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v1_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + 
v2_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v3_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v0__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v1__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v2__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 + v3__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 ws40 = v0__0; + Lib_IntVector_Intrinsics_vec256 ws50 = v2__0; + Lib_IntVector_Intrinsics_vec256 ws60 = v1__0; + Lib_IntVector_Intrinsics_vec256 ws70 = v3__0; + Lib_IntVector_Intrinsics_vec256 v02 = ws[8U]; + Lib_IntVector_Intrinsics_vec256 v12 = ws[9U]; + Lib_IntVector_Intrinsics_vec256 v22 = ws[10U]; + Lib_IntVector_Intrinsics_vec256 v32 = ws[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v1_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v2_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v3_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v0__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v1__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v2__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 + v3__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 ws80 = v0__1; + Lib_IntVector_Intrinsics_vec256 ws90 = v2__1; + Lib_IntVector_Intrinsics_vec256 ws100 = v1__1; + Lib_IntVector_Intrinsics_vec256 ws111 = v3__1; + Lib_IntVector_Intrinsics_vec256 v03 = ws[12U]; + Lib_IntVector_Intrinsics_vec256 v13 = ws[13U]; + Lib_IntVector_Intrinsics_vec256 v23 = ws[14U]; + Lib_IntVector_Intrinsics_vec256 v33 = ws[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v1_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v2_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v3_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v0__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v1__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v2__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 + v3__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 ws120 = v0__2; + Lib_IntVector_Intrinsics_vec256 ws130 = v2__2; + Lib_IntVector_Intrinsics_vec256 ws140 = v1__2; + Lib_IntVector_Intrinsics_vec256 ws150 = v3__2; + Lib_IntVector_Intrinsics_vec256 v04 = ws[16U]; + Lib_IntVector_Intrinsics_vec256 v14 = ws[17U]; + Lib_IntVector_Intrinsics_vec256 v24 = ws[18U]; + Lib_IntVector_Intrinsics_vec256 v34 = ws[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v04, 
v14); + Lib_IntVector_Intrinsics_vec256 + v1_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v2_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v3_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v0__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v1__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v2__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 + v3__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 ws160 = v0__3; + Lib_IntVector_Intrinsics_vec256 ws170 = v2__3; + Lib_IntVector_Intrinsics_vec256 ws180 = v1__3; + Lib_IntVector_Intrinsics_vec256 ws190 = v3__3; + Lib_IntVector_Intrinsics_vec256 v05 = ws[20U]; + Lib_IntVector_Intrinsics_vec256 v15 = ws[21U]; + Lib_IntVector_Intrinsics_vec256 v25 = ws[22U]; + Lib_IntVector_Intrinsics_vec256 v35 = ws[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v1_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v2_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v3_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v0__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v1__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v2__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 + v3__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 ws200 = v0__4; + Lib_IntVector_Intrinsics_vec256 ws211 = v2__4; + Lib_IntVector_Intrinsics_vec256 ws220 = v1__4; + Lib_IntVector_Intrinsics_vec256 ws230 = v3__4; + Lib_IntVector_Intrinsics_vec256 v06 = ws[24U]; + Lib_IntVector_Intrinsics_vec256 v16 = ws[25U]; + Lib_IntVector_Intrinsics_vec256 v26 = ws[26U]; + Lib_IntVector_Intrinsics_vec256 v36 = ws[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v1_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v2_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v3_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v0__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v1__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v2__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 + v3__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 ws240 = v0__5; + Lib_IntVector_Intrinsics_vec256 ws250 = v2__5; + Lib_IntVector_Intrinsics_vec256 ws260 = v1__5; + Lib_IntVector_Intrinsics_vec256 ws270 = v3__5; + Lib_IntVector_Intrinsics_vec256 v07 = ws[28U]; + Lib_IntVector_Intrinsics_vec256 v17 = ws[29U]; + Lib_IntVector_Intrinsics_vec256 v27 = 
ws[30U]; + Lib_IntVector_Intrinsics_vec256 v37 = ws[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v07, v17); + Lib_IntVector_Intrinsics_vec256 + v1_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v07, v17); + Lib_IntVector_Intrinsics_vec256 + v2_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v27, v37); + Lib_IntVector_Intrinsics_vec256 + v3_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v27, v37); + Lib_IntVector_Intrinsics_vec256 + v0__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v1__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v2__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 + v3__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 ws280 = v0__6; + Lib_IntVector_Intrinsics_vec256 ws290 = v2__6; + Lib_IntVector_Intrinsics_vec256 ws300 = v1__6; + Lib_IntVector_Intrinsics_vec256 ws310 = v3__6; + ws[0U] = ws00; + ws[1U] = ws110; + ws[2U] = ws210; + ws[3U] = ws32; + ws[4U] = ws40; + ws[5U] = ws50; + ws[6U] = ws60; + ws[7U] = ws70; + ws[8U] = ws80; + ws[9U] = ws90; + ws[10U] = ws100; + ws[11U] = ws111; + ws[12U] = ws120; + ws[13U] = ws130; + ws[14U] = ws140; + ws[15U] = ws150; + ws[16U] = ws160; + ws[17U] = ws170; + ws[18U] = ws180; + ws[19U] = ws190; + ws[20U] = ws200; + ws[21U] = ws211; + ws[22U] = ws220; + ws[23U] = ws230; + ws[24U] = ws240; + ws[25U] = ws250; + ws[26U] = ws260; + ws[27U] = ws270; + ws[28U] = ws280; + ws[29U] = ws290; + ws[30U] = ws300; + ws[31U] = ws310; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = Lib_IntVector_Intrinsics_vec256_xor(state[i], ws[i]); + } + uint8_t b04[256U] = { 0U }; + uint8_t b14[256U] = { 0U }; + uint8_t b24[256U] = { 0U }; + uint8_t b34[256U] = { 0U }; + K____uint8_t___uint8_t____K____uint8_t___uint8_t_ + b = { .fst = b04, .snd = { .fst = b14, .snd = { .fst = b24, .snd = b34 } } }; + uint8_t *b35 = b.snd.snd.snd; + uint8_t *b25 = b.snd.snd.fst; + uint8_t *b15 = b.snd.fst; + uint8_t *b05 = b.fst; + b05[167U] = 0x80U; + b15[167U] = 0x80U; + b25[167U] = 0x80U; + b35[167U] = 0x80U; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws33[32U] KRML_POST_ALIGN(32) = { 0U }; + uint8_t *b3 = b.snd.snd.snd; + uint8_t *b2 = b.snd.snd.fst; + uint8_t *b1 = b.snd.fst; + uint8_t *b0 = b.fst; + ws33[0U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0); + ws33[1U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1); + ws33[2U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2); + ws33[3U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3); + ws33[4U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 32U); + ws33[5U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 32U); + ws33[6U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 32U); + ws33[7U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 32U); + ws33[8U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 64U); + ws33[9U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 64U); + ws33[10U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 64U); + ws33[11U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 64U); + ws33[12U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 96U); + ws33[13U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 96U); + ws33[14U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 96U); + ws33[15U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 96U); + ws33[16U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 
+ 128U); + ws33[17U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 128U); + ws33[18U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 128U); + ws33[19U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 128U); + ws33[20U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 160U); + ws33[21U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 160U); + ws33[22U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 160U); + ws33[23U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 160U); + ws33[24U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 192U); + ws33[25U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 192U); + ws33[26U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 192U); + ws33[27U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 192U); + ws33[28U] = Lib_IntVector_Intrinsics_vec256_load64_le(b0 + 224U); + ws33[29U] = Lib_IntVector_Intrinsics_vec256_load64_le(b1 + 224U); + ws33[30U] = Lib_IntVector_Intrinsics_vec256_load64_le(b2 + 224U); + ws33[31U] = Lib_IntVector_Intrinsics_vec256_load64_le(b3 + 224U); + Lib_IntVector_Intrinsics_vec256 v08 = ws33[0U]; + Lib_IntVector_Intrinsics_vec256 v18 = ws33[1U]; + Lib_IntVector_Intrinsics_vec256 v28 = ws33[2U]; + Lib_IntVector_Intrinsics_vec256 v38 = ws33[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_7 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v08, v18); + Lib_IntVector_Intrinsics_vec256 + v1_7 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v08, v18); + Lib_IntVector_Intrinsics_vec256 + v2_7 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v28, v38); + Lib_IntVector_Intrinsics_vec256 + v3_7 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v28, v38); + Lib_IntVector_Intrinsics_vec256 + v0__7 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_7, v2_7); + Lib_IntVector_Intrinsics_vec256 + v1__7 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_7, v2_7); + Lib_IntVector_Intrinsics_vec256 + v2__7 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_7, v3_7); + Lib_IntVector_Intrinsics_vec256 + v3__7 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_7, v3_7); + Lib_IntVector_Intrinsics_vec256 ws0 = v0__7; + Lib_IntVector_Intrinsics_vec256 ws1 = v2__7; + Lib_IntVector_Intrinsics_vec256 ws2 = v1__7; + Lib_IntVector_Intrinsics_vec256 ws3 = v3__7; + Lib_IntVector_Intrinsics_vec256 v09 = ws33[4U]; + Lib_IntVector_Intrinsics_vec256 v19 = ws33[5U]; + Lib_IntVector_Intrinsics_vec256 v29 = ws33[6U]; + Lib_IntVector_Intrinsics_vec256 v39 = ws33[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_8 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v09, v19); + Lib_IntVector_Intrinsics_vec256 + v1_8 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v09, v19); + Lib_IntVector_Intrinsics_vec256 + v2_8 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v29, v39); + Lib_IntVector_Intrinsics_vec256 + v3_8 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v29, v39); + Lib_IntVector_Intrinsics_vec256 + v0__8 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_8, v2_8); + Lib_IntVector_Intrinsics_vec256 + v1__8 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_8, v2_8); + Lib_IntVector_Intrinsics_vec256 + v2__8 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_8, v3_8); + Lib_IntVector_Intrinsics_vec256 + v3__8 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_8, v3_8); + Lib_IntVector_Intrinsics_vec256 ws4 = v0__8; + Lib_IntVector_Intrinsics_vec256 ws5 = v2__8; + Lib_IntVector_Intrinsics_vec256 ws6 = v1__8; + Lib_IntVector_Intrinsics_vec256 ws7 = v3__8; + Lib_IntVector_Intrinsics_vec256 v010 = ws33[8U]; + 
Lib_IntVector_Intrinsics_vec256 v110 = ws33[9U]; + Lib_IntVector_Intrinsics_vec256 v210 = ws33[10U]; + Lib_IntVector_Intrinsics_vec256 v310 = ws33[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_9 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v010, v110); + Lib_IntVector_Intrinsics_vec256 + v1_9 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v010, v110); + Lib_IntVector_Intrinsics_vec256 + v2_9 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v210, v310); + Lib_IntVector_Intrinsics_vec256 + v3_9 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v210, v310); + Lib_IntVector_Intrinsics_vec256 + v0__9 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_9, v2_9); + Lib_IntVector_Intrinsics_vec256 + v1__9 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_9, v2_9); + Lib_IntVector_Intrinsics_vec256 + v2__9 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_9, v3_9); + Lib_IntVector_Intrinsics_vec256 + v3__9 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_9, v3_9); + Lib_IntVector_Intrinsics_vec256 ws8 = v0__9; + Lib_IntVector_Intrinsics_vec256 ws9 = v2__9; + Lib_IntVector_Intrinsics_vec256 ws10 = v1__9; + Lib_IntVector_Intrinsics_vec256 ws11 = v3__9; + Lib_IntVector_Intrinsics_vec256 v011 = ws33[12U]; + Lib_IntVector_Intrinsics_vec256 v111 = ws33[13U]; + Lib_IntVector_Intrinsics_vec256 v211 = ws33[14U]; + Lib_IntVector_Intrinsics_vec256 v311 = ws33[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_10 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v011, v111); + Lib_IntVector_Intrinsics_vec256 + v1_10 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v011, v111); + Lib_IntVector_Intrinsics_vec256 + v2_10 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v211, v311); + Lib_IntVector_Intrinsics_vec256 + v3_10 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v211, v311); + Lib_IntVector_Intrinsics_vec256 + v0__10 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_10, v2_10); + Lib_IntVector_Intrinsics_vec256 + v1__10 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_10, v2_10); + Lib_IntVector_Intrinsics_vec256 + v2__10 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_10, v3_10); + Lib_IntVector_Intrinsics_vec256 + v3__10 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_10, v3_10); + Lib_IntVector_Intrinsics_vec256 ws12 = v0__10; + Lib_IntVector_Intrinsics_vec256 ws13 = v2__10; + Lib_IntVector_Intrinsics_vec256 ws14 = v1__10; + Lib_IntVector_Intrinsics_vec256 ws15 = v3__10; + Lib_IntVector_Intrinsics_vec256 v012 = ws33[16U]; + Lib_IntVector_Intrinsics_vec256 v112 = ws33[17U]; + Lib_IntVector_Intrinsics_vec256 v212 = ws33[18U]; + Lib_IntVector_Intrinsics_vec256 v312 = ws33[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_11 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v012, v112); + Lib_IntVector_Intrinsics_vec256 + v1_11 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v012, v112); + Lib_IntVector_Intrinsics_vec256 + v2_11 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v212, v312); + Lib_IntVector_Intrinsics_vec256 + v3_11 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v212, v312); + Lib_IntVector_Intrinsics_vec256 + v0__11 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_11, v2_11); + Lib_IntVector_Intrinsics_vec256 + v1__11 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_11, v2_11); + Lib_IntVector_Intrinsics_vec256 + v2__11 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_11, v3_11); + Lib_IntVector_Intrinsics_vec256 + v3__11 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_11, v3_11); + 
Lib_IntVector_Intrinsics_vec256 ws16 = v0__11; + Lib_IntVector_Intrinsics_vec256 ws17 = v2__11; + Lib_IntVector_Intrinsics_vec256 ws18 = v1__11; + Lib_IntVector_Intrinsics_vec256 ws19 = v3__11; + Lib_IntVector_Intrinsics_vec256 v013 = ws33[20U]; + Lib_IntVector_Intrinsics_vec256 v113 = ws33[21U]; + Lib_IntVector_Intrinsics_vec256 v213 = ws33[22U]; + Lib_IntVector_Intrinsics_vec256 v313 = ws33[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_12 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v013, v113); + Lib_IntVector_Intrinsics_vec256 + v1_12 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v013, v113); + Lib_IntVector_Intrinsics_vec256 + v2_12 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v213, v313); + Lib_IntVector_Intrinsics_vec256 + v3_12 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v213, v313); + Lib_IntVector_Intrinsics_vec256 + v0__12 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_12, v2_12); + Lib_IntVector_Intrinsics_vec256 + v1__12 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_12, v2_12); + Lib_IntVector_Intrinsics_vec256 + v2__12 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_12, v3_12); + Lib_IntVector_Intrinsics_vec256 + v3__12 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_12, v3_12); + Lib_IntVector_Intrinsics_vec256 ws20 = v0__12; + Lib_IntVector_Intrinsics_vec256 ws21 = v2__12; + Lib_IntVector_Intrinsics_vec256 ws22 = v1__12; + Lib_IntVector_Intrinsics_vec256 ws23 = v3__12; + Lib_IntVector_Intrinsics_vec256 v014 = ws33[24U]; + Lib_IntVector_Intrinsics_vec256 v114 = ws33[25U]; + Lib_IntVector_Intrinsics_vec256 v214 = ws33[26U]; + Lib_IntVector_Intrinsics_vec256 v314 = ws33[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_13 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v014, v114); + Lib_IntVector_Intrinsics_vec256 + v1_13 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v014, v114); + Lib_IntVector_Intrinsics_vec256 + v2_13 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v214, v314); + Lib_IntVector_Intrinsics_vec256 + v3_13 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v214, v314); + Lib_IntVector_Intrinsics_vec256 + v0__13 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_13, v2_13); + Lib_IntVector_Intrinsics_vec256 + v1__13 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_13, v2_13); + Lib_IntVector_Intrinsics_vec256 + v2__13 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_13, v3_13); + Lib_IntVector_Intrinsics_vec256 + v3__13 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_13, v3_13); + Lib_IntVector_Intrinsics_vec256 ws24 = v0__13; + Lib_IntVector_Intrinsics_vec256 ws25 = v2__13; + Lib_IntVector_Intrinsics_vec256 ws26 = v1__13; + Lib_IntVector_Intrinsics_vec256 ws27 = v3__13; + Lib_IntVector_Intrinsics_vec256 v0 = ws33[28U]; + Lib_IntVector_Intrinsics_vec256 v1 = ws33[29U]; + Lib_IntVector_Intrinsics_vec256 v2 = ws33[30U]; + Lib_IntVector_Intrinsics_vec256 v3 = ws33[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_14 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v1_14 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v2_14 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v3_14 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v0__14 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_14, v2_14); + Lib_IntVector_Intrinsics_vec256 + v1__14 = 
Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_14, v2_14); + Lib_IntVector_Intrinsics_vec256 + v2__14 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_14, v3_14); + Lib_IntVector_Intrinsics_vec256 + v3__14 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_14, v3_14); + Lib_IntVector_Intrinsics_vec256 ws28 = v0__14; + Lib_IntVector_Intrinsics_vec256 ws29 = v2__14; + Lib_IntVector_Intrinsics_vec256 ws30 = v1__14; + Lib_IntVector_Intrinsics_vec256 ws31 = v3__14; + ws33[0U] = ws0; + ws33[1U] = ws1; + ws33[2U] = ws2; + ws33[3U] = ws3; + ws33[4U] = ws4; + ws33[5U] = ws5; + ws33[6U] = ws6; + ws33[7U] = ws7; + ws33[8U] = ws8; + ws33[9U] = ws9; + ws33[10U] = ws10; + ws33[11U] = ws11; + ws33[12U] = ws12; + ws33[13U] = ws13; + ws33[14U] = ws14; + ws33[15U] = ws15; + ws33[16U] = ws16; + ws33[17U] = ws17; + ws33[18U] = ws18; + ws33[19U] = ws19; + ws33[20U] = ws20; + ws33[21U] = ws21; + ws33[22U] = ws22; + ws33[23U] = ws23; + ws33[24U] = ws24; + ws33[25U] = ws25; + ws33[26U] = ws26; + ws33[27U] = ws27; + ws33[28U] = ws28; + ws33[29U] = ws29; + ws33[30U] = ws30; + ws33[31U] = ws31; + for (uint32_t i = 0U; i < 25U; i++) + { + state[i] = Lib_IntVector_Intrinsics_vec256_xor(state[i], ws33[i]); + } + for (uint32_t i0 = 0U; i0 < 24U; i0++) + { + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____0 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____1 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____2 = state[i + 10U]; + _C[i] = + Lib_IntVector_Intrinsics_vec256_xor(uu____0, + Lib_IntVector_Intrinsics_vec256_xor(uu____1, + Lib_IntVector_Intrinsics_vec256_xor(uu____2, + Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); + KRML_MAYBE_FOR5(i1, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____3 = _C[(i1 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____4 = _C[(i1 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 + _D = + Lib_IntVector_Intrinsics_vec256_xor(uu____3, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____4, + 1U), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____4, 63U))); + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + state[i1 + 5U * i] = Lib_IntVector_Intrinsics_vec256_xor(state[i1 + 5U * i], _D););); + Lib_IntVector_Intrinsics_vec256 x = state[1U]; + Lib_IntVector_Intrinsics_vec256 current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; + Lib_IntVector_Intrinsics_vec256 uu____5 = current; + state[_Y] = + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____5, r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____5, 64U - r)); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____6 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____7 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v015 = + Lib_IntVector_Intrinsics_vec256_xor(uu____6, + Lib_IntVector_Intrinsics_vec256_and(uu____7, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____8 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____9 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v115 = + Lib_IntVector_Intrinsics_vec256_xor(uu____8, + 
Lib_IntVector_Intrinsics_vec256_and(uu____9, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____10 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____11 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v215 = + Lib_IntVector_Intrinsics_vec256_xor(uu____10, + Lib_IntVector_Intrinsics_vec256_and(uu____11, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____12 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____13 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v315 = + Lib_IntVector_Intrinsics_vec256_xor(uu____12, + Lib_IntVector_Intrinsics_vec256_and(uu____13, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____14 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____15 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v4 = + Lib_IntVector_Intrinsics_vec256_xor(uu____14, + Lib_IntVector_Intrinsics_vec256_and(uu____15, state[1U + 5U * i])); + state[0U + 5U * i] = v015; + state[1U + 5U * i] = v115; + state[2U + 5U * i] = v215; + state[3U + 5U * i] = v315; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; + Lib_IntVector_Intrinsics_vec256 uu____16 = state[0U]; + state[0U] = + Lib_IntVector_Intrinsics_vec256_xor(uu____16, + Lib_IntVector_Intrinsics_vec256_load64(c)); + } +} + +void +Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks( + Lib_IntVector_Intrinsics_vec256 *state, + uint8_t *output0, + uint8_t *output1, + uint8_t *output2, + uint8_t *output3, + uint32_t outputByteLen +) +{ + for (uint32_t i0 = 0U; i0 < outputByteLen / 168U; i0++) + { + uint8_t hbuf[1024U] = { 0U }; + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 ws[32U] KRML_POST_ALIGN(32) = { 0U }; + memcpy(ws, state, 25U * sizeof (Lib_IntVector_Intrinsics_vec256)); + Lib_IntVector_Intrinsics_vec256 v00 = ws[0U]; + Lib_IntVector_Intrinsics_vec256 v10 = ws[1U]; + Lib_IntVector_Intrinsics_vec256 v20 = ws[2U]; + Lib_IntVector_Intrinsics_vec256 v30 = ws[3U]; + Lib_IntVector_Intrinsics_vec256 + v0_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v1_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v00, v10); + Lib_IntVector_Intrinsics_vec256 + v2_ = Lib_IntVector_Intrinsics_vec256_interleave_low64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v3_ = Lib_IntVector_Intrinsics_vec256_interleave_high64(v20, v30); + Lib_IntVector_Intrinsics_vec256 + v0__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v1__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_, v2_); + Lib_IntVector_Intrinsics_vec256 + v2__ = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 + v3__ = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_, v3_); + Lib_IntVector_Intrinsics_vec256 ws0 = v0__; + Lib_IntVector_Intrinsics_vec256 ws1 = v2__; + Lib_IntVector_Intrinsics_vec256 ws2 = v1__; + Lib_IntVector_Intrinsics_vec256 ws3 = v3__; + Lib_IntVector_Intrinsics_vec256 v01 = ws[4U]; + Lib_IntVector_Intrinsics_vec256 v11 = ws[5U]; + Lib_IntVector_Intrinsics_vec256 v21 = ws[6U]; + Lib_IntVector_Intrinsics_vec256 v31 = ws[7U]; + Lib_IntVector_Intrinsics_vec256 + v0_0 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v1_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v01, v11); + Lib_IntVector_Intrinsics_vec256 + v2_0 = 
Lib_IntVector_Intrinsics_vec256_interleave_low64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v3_0 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v21, v31); + Lib_IntVector_Intrinsics_vec256 + v0__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v1__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_0, v2_0); + Lib_IntVector_Intrinsics_vec256 + v2__0 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 + v3__0 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_0, v3_0); + Lib_IntVector_Intrinsics_vec256 ws4 = v0__0; + Lib_IntVector_Intrinsics_vec256 ws5 = v2__0; + Lib_IntVector_Intrinsics_vec256 ws6 = v1__0; + Lib_IntVector_Intrinsics_vec256 ws7 = v3__0; + Lib_IntVector_Intrinsics_vec256 v02 = ws[8U]; + Lib_IntVector_Intrinsics_vec256 v12 = ws[9U]; + Lib_IntVector_Intrinsics_vec256 v22 = ws[10U]; + Lib_IntVector_Intrinsics_vec256 v32 = ws[11U]; + Lib_IntVector_Intrinsics_vec256 + v0_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v1_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v02, v12); + Lib_IntVector_Intrinsics_vec256 + v2_1 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v3_1 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v22, v32); + Lib_IntVector_Intrinsics_vec256 + v0__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v1__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_1, v2_1); + Lib_IntVector_Intrinsics_vec256 + v2__1 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 + v3__1 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_1, v3_1); + Lib_IntVector_Intrinsics_vec256 ws8 = v0__1; + Lib_IntVector_Intrinsics_vec256 ws9 = v2__1; + Lib_IntVector_Intrinsics_vec256 ws10 = v1__1; + Lib_IntVector_Intrinsics_vec256 ws11 = v3__1; + Lib_IntVector_Intrinsics_vec256 v03 = ws[12U]; + Lib_IntVector_Intrinsics_vec256 v13 = ws[13U]; + Lib_IntVector_Intrinsics_vec256 v23 = ws[14U]; + Lib_IntVector_Intrinsics_vec256 v33 = ws[15U]; + Lib_IntVector_Intrinsics_vec256 + v0_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v1_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v03, v13); + Lib_IntVector_Intrinsics_vec256 + v2_2 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v3_2 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v23, v33); + Lib_IntVector_Intrinsics_vec256 + v0__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v1__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_2, v2_2); + Lib_IntVector_Intrinsics_vec256 + v2__2 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 + v3__2 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_2, v3_2); + Lib_IntVector_Intrinsics_vec256 ws12 = v0__2; + Lib_IntVector_Intrinsics_vec256 ws13 = v2__2; + Lib_IntVector_Intrinsics_vec256 ws14 = v1__2; + Lib_IntVector_Intrinsics_vec256 ws15 = v3__2; + Lib_IntVector_Intrinsics_vec256 v04 = ws[16U]; + Lib_IntVector_Intrinsics_vec256 v14 = ws[17U]; + Lib_IntVector_Intrinsics_vec256 v24 = ws[18U]; + Lib_IntVector_Intrinsics_vec256 v34 = ws[19U]; + Lib_IntVector_Intrinsics_vec256 + v0_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v04, v14); + 
Lib_IntVector_Intrinsics_vec256 + v1_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v04, v14); + Lib_IntVector_Intrinsics_vec256 + v2_3 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v3_3 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v24, v34); + Lib_IntVector_Intrinsics_vec256 + v0__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v1__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_3, v2_3); + Lib_IntVector_Intrinsics_vec256 + v2__3 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 + v3__3 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_3, v3_3); + Lib_IntVector_Intrinsics_vec256 ws16 = v0__3; + Lib_IntVector_Intrinsics_vec256 ws17 = v2__3; + Lib_IntVector_Intrinsics_vec256 ws18 = v1__3; + Lib_IntVector_Intrinsics_vec256 ws19 = v3__3; + Lib_IntVector_Intrinsics_vec256 v05 = ws[20U]; + Lib_IntVector_Intrinsics_vec256 v15 = ws[21U]; + Lib_IntVector_Intrinsics_vec256 v25 = ws[22U]; + Lib_IntVector_Intrinsics_vec256 v35 = ws[23U]; + Lib_IntVector_Intrinsics_vec256 + v0_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v1_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v05, v15); + Lib_IntVector_Intrinsics_vec256 + v2_4 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v3_4 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v25, v35); + Lib_IntVector_Intrinsics_vec256 + v0__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v1__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_4, v2_4); + Lib_IntVector_Intrinsics_vec256 + v2__4 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 + v3__4 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_4, v3_4); + Lib_IntVector_Intrinsics_vec256 ws20 = v0__4; + Lib_IntVector_Intrinsics_vec256 ws21 = v2__4; + Lib_IntVector_Intrinsics_vec256 ws22 = v1__4; + Lib_IntVector_Intrinsics_vec256 ws23 = v3__4; + Lib_IntVector_Intrinsics_vec256 v06 = ws[24U]; + Lib_IntVector_Intrinsics_vec256 v16 = ws[25U]; + Lib_IntVector_Intrinsics_vec256 v26 = ws[26U]; + Lib_IntVector_Intrinsics_vec256 v36 = ws[27U]; + Lib_IntVector_Intrinsics_vec256 + v0_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v1_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v06, v16); + Lib_IntVector_Intrinsics_vec256 + v2_5 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v3_5 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v26, v36); + Lib_IntVector_Intrinsics_vec256 + v0__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v1__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_5, v2_5); + Lib_IntVector_Intrinsics_vec256 + v2__5 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 + v3__5 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_5, v3_5); + Lib_IntVector_Intrinsics_vec256 ws24 = v0__5; + Lib_IntVector_Intrinsics_vec256 ws25 = v2__5; + Lib_IntVector_Intrinsics_vec256 ws26 = v1__5; + Lib_IntVector_Intrinsics_vec256 ws27 = v3__5; + Lib_IntVector_Intrinsics_vec256 v0 = ws[28U]; + Lib_IntVector_Intrinsics_vec256 v1 = ws[29U]; + Lib_IntVector_Intrinsics_vec256 v2 = ws[30U]; + 
Lib_IntVector_Intrinsics_vec256 v3 = ws[31U]; + Lib_IntVector_Intrinsics_vec256 + v0_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v1_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v0, v1); + Lib_IntVector_Intrinsics_vec256 + v2_6 = Lib_IntVector_Intrinsics_vec256_interleave_low64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v3_6 = Lib_IntVector_Intrinsics_vec256_interleave_high64(v2, v3); + Lib_IntVector_Intrinsics_vec256 + v0__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v1__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v0_6, v2_6); + Lib_IntVector_Intrinsics_vec256 + v2__6 = Lib_IntVector_Intrinsics_vec256_interleave_low128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 + v3__6 = Lib_IntVector_Intrinsics_vec256_interleave_high128(v1_6, v3_6); + Lib_IntVector_Intrinsics_vec256 ws28 = v0__6; + Lib_IntVector_Intrinsics_vec256 ws29 = v2__6; + Lib_IntVector_Intrinsics_vec256 ws30 = v1__6; + Lib_IntVector_Intrinsics_vec256 ws31 = v3__6; + ws[0U] = ws0; + ws[1U] = ws4; + ws[2U] = ws8; + ws[3U] = ws12; + ws[4U] = ws16; + ws[5U] = ws20; + ws[6U] = ws24; + ws[7U] = ws28; + ws[8U] = ws1; + ws[9U] = ws5; + ws[10U] = ws9; + ws[11U] = ws13; + ws[12U] = ws17; + ws[13U] = ws21; + ws[14U] = ws25; + ws[15U] = ws29; + ws[16U] = ws2; + ws[17U] = ws6; + ws[18U] = ws10; + ws[19U] = ws14; + ws[20U] = ws18; + ws[21U] = ws22; + ws[22U] = ws26; + ws[23U] = ws30; + ws[24U] = ws3; + ws[25U] = ws7; + ws[26U] = ws11; + ws[27U] = ws15; + ws[28U] = ws19; + ws[29U] = ws23; + ws[30U] = ws27; + ws[31U] = ws31; + for (uint32_t i = 0U; i < 32U; i++) + { + Lib_IntVector_Intrinsics_vec256_store64_le(hbuf + i * 32U, ws[i]); + } + uint8_t *b0 = output0; + uint8_t *b1 = output1; + uint8_t *b2 = output2; + uint8_t *b3 = output3; + memcpy(b0 + i0 * 168U, hbuf, 168U * sizeof (uint8_t)); + memcpy(b1 + i0 * 168U, hbuf + 256U, 168U * sizeof (uint8_t)); + memcpy(b2 + i0 * 168U, hbuf + 512U, 168U * sizeof (uint8_t)); + memcpy(b3 + i0 * 168U, hbuf + 768U, 168U * sizeof (uint8_t)); + for (uint32_t i1 = 0U; i1 < 24U; i1++) + { + KRML_PRE_ALIGN(32) Lib_IntVector_Intrinsics_vec256 _C[5U] KRML_POST_ALIGN(32) = { 0U }; + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____0 = state[i + 0U]; + Lib_IntVector_Intrinsics_vec256 uu____1 = state[i + 5U]; + Lib_IntVector_Intrinsics_vec256 uu____2 = state[i + 10U]; + _C[i] = + Lib_IntVector_Intrinsics_vec256_xor(uu____0, + Lib_IntVector_Intrinsics_vec256_xor(uu____1, + Lib_IntVector_Intrinsics_vec256_xor(uu____2, + Lib_IntVector_Intrinsics_vec256_xor(state[i + 15U], state[i + 20U]))));); + KRML_MAYBE_FOR5(i2, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____3 = _C[(i2 + 4U) % 5U]; + Lib_IntVector_Intrinsics_vec256 uu____4 = _C[(i2 + 1U) % 5U]; + Lib_IntVector_Intrinsics_vec256 + _D = + Lib_IntVector_Intrinsics_vec256_xor(uu____3, + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____4, + 1U), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____4, 63U))); + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + state[i2 + 5U * i] = Lib_IntVector_Intrinsics_vec256_xor(state[i2 + 5U * i], _D););); + Lib_IntVector_Intrinsics_vec256 x = state[1U]; + Lib_IntVector_Intrinsics_vec256 current = x; + for (uint32_t i = 0U; i < 24U; i++) + { + uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; + uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; + Lib_IntVector_Intrinsics_vec256 temp = state[_Y]; + Lib_IntVector_Intrinsics_vec256 uu____5 = 
current; + state[_Y] = + Lib_IntVector_Intrinsics_vec256_or(Lib_IntVector_Intrinsics_vec256_shift_left64(uu____5, + r), + Lib_IntVector_Intrinsics_vec256_shift_right64(uu____5, 64U - r)); + current = temp; + } + KRML_MAYBE_FOR5(i, + 0U, + 5U, + 1U, + Lib_IntVector_Intrinsics_vec256 uu____6 = state[0U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____7 = Lib_IntVector_Intrinsics_vec256_lognot(state[1U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v07 = + Lib_IntVector_Intrinsics_vec256_xor(uu____6, + Lib_IntVector_Intrinsics_vec256_and(uu____7, state[2U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____8 = state[1U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____9 = Lib_IntVector_Intrinsics_vec256_lognot(state[2U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v17 = + Lib_IntVector_Intrinsics_vec256_xor(uu____8, + Lib_IntVector_Intrinsics_vec256_and(uu____9, state[3U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____10 = state[2U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____11 = Lib_IntVector_Intrinsics_vec256_lognot(state[3U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v27 = + Lib_IntVector_Intrinsics_vec256_xor(uu____10, + Lib_IntVector_Intrinsics_vec256_and(uu____11, state[4U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____12 = state[3U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____13 = Lib_IntVector_Intrinsics_vec256_lognot(state[4U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v37 = + Lib_IntVector_Intrinsics_vec256_xor(uu____12, + Lib_IntVector_Intrinsics_vec256_and(uu____13, state[0U + 5U * i])); + Lib_IntVector_Intrinsics_vec256 uu____14 = state[4U + 5U * i]; + Lib_IntVector_Intrinsics_vec256 + uu____15 = Lib_IntVector_Intrinsics_vec256_lognot(state[0U + 5U * i]); + Lib_IntVector_Intrinsics_vec256 + v4 = + Lib_IntVector_Intrinsics_vec256_xor(uu____14, + Lib_IntVector_Intrinsics_vec256_and(uu____15, state[1U + 5U * i])); + state[0U + 5U * i] = v07; + state[1U + 5U * i] = v17; + state[2U + 5U * i] = v27; + state[3U + 5U * i] = v37; + state[4U + 5U * i] = v4;); + uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; + Lib_IntVector_Intrinsics_vec256 uu____16 = state[0U]; + state[0U] = + Lib_IntVector_Intrinsics_vec256_xor(uu____16, + Lib_IntVector_Intrinsics_vec256_load64(c)); + } + } } From 9586ae756e7983ae63dedcf133e80c2b67f3bd61 Mon Sep 17 00:00:00 2001 From: mamonet Date: Mon, 15 Jan 2024 20:58:44 +0200 Subject: [PATCH 4/6] Remove shake128_absorb API --- include/Hacl_Hash_SHA3_Scalar.h | 5 +- include/Hacl_Hash_SHA3_Simd256.h | 2 +- include/msvc/Hacl_Hash_SHA3_Scalar.h | 5 +- include/msvc/Hacl_Hash_SHA3_Simd256.h | 2 +- src/Hacl_Hash_SHA3_Scalar.c | 327 +------------------------- src/Hacl_Hash_SHA3_Simd256.c | 2 +- src/msvc/Hacl_Hash_SHA3_Scalar.c | 327 +------------------------- src/msvc/Hacl_Hash_SHA3_Simd256.c | 2 +- 8 files changed, 8 insertions(+), 664 deletions(-) diff --git a/include/Hacl_Hash_SHA3_Scalar.h b/include/Hacl_Hash_SHA3_Scalar.h index d0b7f253..63cf8710 100644 --- a/include/Hacl_Hash_SHA3_Scalar.h +++ b/include/Hacl_Hash_SHA3_Scalar.h @@ -71,15 +71,12 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( ); void -Hacl_Hash_SHA3_Scalar_shake128_absorb_last( +Hacl_Hash_SHA3_Scalar_shake128_absorb_final( uint64_t *state, uint8_t *input, uint32_t inputByteLen ); -void -Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen); - void Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( uint64_t *state, diff --git a/include/Hacl_Hash_SHA3_Simd256.h b/include/Hacl_Hash_SHA3_Simd256.h index d231e273..25c1a166 
100644 --- a/include/Hacl_Hash_SHA3_Simd256.h +++ b/include/Hacl_Hash_SHA3_Simd256.h @@ -153,7 +153,7 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( ); void -Hacl_Hash_SHA3_Simd256_shake128_absorb_last( +Hacl_Hash_SHA3_Simd256_shake128_absorb_final( Lib_IntVector_Intrinsics_vec256 *state, uint8_t *input0, uint8_t *input1, diff --git a/include/msvc/Hacl_Hash_SHA3_Scalar.h b/include/msvc/Hacl_Hash_SHA3_Scalar.h index d0b7f253..63cf8710 100644 --- a/include/msvc/Hacl_Hash_SHA3_Scalar.h +++ b/include/msvc/Hacl_Hash_SHA3_Scalar.h @@ -71,15 +71,12 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( ); void -Hacl_Hash_SHA3_Scalar_shake128_absorb_last( +Hacl_Hash_SHA3_Scalar_shake128_absorb_final( uint64_t *state, uint8_t *input, uint32_t inputByteLen ); -void -Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen); - void Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( uint64_t *state, diff --git a/include/msvc/Hacl_Hash_SHA3_Simd256.h b/include/msvc/Hacl_Hash_SHA3_Simd256.h index d231e273..25c1a166 100644 --- a/include/msvc/Hacl_Hash_SHA3_Simd256.h +++ b/include/msvc/Hacl_Hash_SHA3_Simd256.h @@ -153,7 +153,7 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( ); void -Hacl_Hash_SHA3_Simd256_shake128_absorb_last( +Hacl_Hash_SHA3_Simd256_shake128_absorb_final( Lib_IntVector_Intrinsics_vec256 *state, uint8_t *input0, uint8_t *input1, diff --git a/src/Hacl_Hash_SHA3_Scalar.c b/src/Hacl_Hash_SHA3_Scalar.c index 7393ebf2..6d6806a3 100644 --- a/src/Hacl_Hash_SHA3_Scalar.c +++ b/src/Hacl_Hash_SHA3_Scalar.c @@ -2526,7 +2526,7 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( } void -Hacl_Hash_SHA3_Scalar_shake128_absorb_last( +Hacl_Hash_SHA3_Scalar_shake128_absorb_final( uint64_t *state, uint8_t *input, uint32_t inputByteLen @@ -2730,331 +2730,6 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_last( } } -void -Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen) -{ - for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) - { - uint8_t b1[256U] = { 0U }; - uint8_t *b_ = b1; - uint8_t *b0 = input; - uint8_t *bl0 = b_; - memcpy(bl0, b0 + i0 * 168U, 168U * sizeof (uint8_t)); - uint64_t ws[32U] = { 0U }; - uint8_t *b = b_; - uint64_t u = load64_le(b); - ws[0U] = u; - uint64_t u0 = load64_le(b + 8U); - ws[1U] = u0; - uint64_t u1 = load64_le(b + 16U); - ws[2U] = u1; - uint64_t u2 = load64_le(b + 24U); - ws[3U] = u2; - uint64_t u3 = load64_le(b + 32U); - ws[4U] = u3; - uint64_t u4 = load64_le(b + 40U); - ws[5U] = u4; - uint64_t u5 = load64_le(b + 48U); - ws[6U] = u5; - uint64_t u6 = load64_le(b + 56U); - ws[7U] = u6; - uint64_t u7 = load64_le(b + 64U); - ws[8U] = u7; - uint64_t u8 = load64_le(b + 72U); - ws[9U] = u8; - uint64_t u9 = load64_le(b + 80U); - ws[10U] = u9; - uint64_t u10 = load64_le(b + 88U); - ws[11U] = u10; - uint64_t u11 = load64_le(b + 96U); - ws[12U] = u11; - uint64_t u12 = load64_le(b + 104U); - ws[13U] = u12; - uint64_t u13 = load64_le(b + 112U); - ws[14U] = u13; - uint64_t u14 = load64_le(b + 120U); - ws[15U] = u14; - uint64_t u15 = load64_le(b + 128U); - ws[16U] = u15; - uint64_t u16 = load64_le(b + 136U); - ws[17U] = u16; - uint64_t u17 = load64_le(b + 144U); - ws[18U] = u17; - uint64_t u18 = load64_le(b + 152U); - ws[19U] = u18; - uint64_t u19 = load64_le(b + 160U); - ws[20U] = u19; - uint64_t u20 = load64_le(b + 168U); - ws[21U] = u20; - uint64_t u21 = load64_le(b + 176U); - ws[22U] = u21; - uint64_t u22 = load64_le(b + 184U); - ws[23U] = u22; - uint64_t u23 = load64_le(b + 192U); - ws[24U] = u23; - uint64_t u24 = 
load64_le(b + 200U); - ws[25U] = u24; - uint64_t u25 = load64_le(b + 208U); - ws[26U] = u25; - uint64_t u26 = load64_le(b + 216U); - ws[27U] = u26; - uint64_t u27 = load64_le(b + 224U); - ws[28U] = u27; - uint64_t u28 = load64_le(b + 232U); - ws[29U] = u28; - uint64_t u29 = load64_le(b + 240U); - ws[30U] = u29; - uint64_t u30 = load64_le(b + 248U); - ws[31U] = u30; - for (uint32_t i = 0U; i < 25U; i++) - { - state[i] = state[i] ^ ws[i]; - } - for (uint32_t i1 = 0U; i1 < 24U; i1++) - { - uint64_t _C[5U] = { 0U }; - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - _C[i] = - state[i - + 0U] - ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); - KRML_MAYBE_FOR5(i2, - 0U, - 5U, - 1U, - uint64_t uu____0 = _C[(i2 + 1U) % 5U]; - uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); - KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); - uint64_t x = state[1U]; - uint64_t current = x; - for (uint32_t i = 0U; i < 24U; i++) - { - uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; - uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; - uint64_t temp = state[_Y]; - uint64_t uu____1 = current; - state[_Y] = uu____1 << r | uu____1 >> (64U - r); - current = temp; - } - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); - uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); - uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); - uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); - uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); - state[0U + 5U * i] = v0; - state[1U + 5U * i] = v1; - state[2U + 5U * i] = v2; - state[3U + 5U * i] = v3; - state[4U + 5U * i] = v4;); - uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; - state[0U] = state[0U] ^ c; - } - } - uint32_t rem = inputByteLen % 168U; - uint8_t b2[256U] = { 0U }; - uint8_t *b_ = b2; - uint32_t rem1 = inputByteLen % 168U; - uint8_t *b00 = input; - uint8_t *bl0 = b_; - memcpy(bl0, b00 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - uint8_t *b01 = b_; - b01[rem] = 0x1FU; - uint64_t ws[32U] = { 0U }; - uint8_t *b = b_; - uint64_t u0 = load64_le(b); - ws[0U] = u0; - uint64_t u1 = load64_le(b + 8U); - ws[1U] = u1; - uint64_t u2 = load64_le(b + 16U); - ws[2U] = u2; - uint64_t u3 = load64_le(b + 24U); - ws[3U] = u3; - uint64_t u4 = load64_le(b + 32U); - ws[4U] = u4; - uint64_t u5 = load64_le(b + 40U); - ws[5U] = u5; - uint64_t u6 = load64_le(b + 48U); - ws[6U] = u6; - uint64_t u7 = load64_le(b + 56U); - ws[7U] = u7; - uint64_t u8 = load64_le(b + 64U); - ws[8U] = u8; - uint64_t u9 = load64_le(b + 72U); - ws[9U] = u9; - uint64_t u10 = load64_le(b + 80U); - ws[10U] = u10; - uint64_t u11 = load64_le(b + 88U); - ws[11U] = u11; - uint64_t u12 = load64_le(b + 96U); - ws[12U] = u12; - uint64_t u13 = load64_le(b + 104U); - ws[13U] = u13; - uint64_t u14 = load64_le(b + 112U); - ws[14U] = u14; - uint64_t u15 = load64_le(b + 120U); - ws[15U] = u15; - uint64_t u16 = load64_le(b + 128U); - ws[16U] = u16; - uint64_t u17 = load64_le(b + 136U); - ws[17U] = u17; - uint64_t u18 = load64_le(b + 144U); - ws[18U] = u18; - uint64_t u19 = load64_le(b + 152U); - ws[19U] = u19; - uint64_t u20 = load64_le(b + 160U); - ws[20U] = u20; - uint64_t u21 = load64_le(b + 168U); - ws[21U] = u21; - uint64_t u22 = load64_le(b + 176U); - ws[22U] = u22; - uint64_t u23 = load64_le(b + 184U); - ws[23U] = u23; - uint64_t u24 = load64_le(b + 192U); - ws[24U] = u24; - uint64_t 
u25 = load64_le(b + 200U); - ws[25U] = u25; - uint64_t u26 = load64_le(b + 208U); - ws[26U] = u26; - uint64_t u27 = load64_le(b + 216U); - ws[27U] = u27; - uint64_t u28 = load64_le(b + 224U); - ws[28U] = u28; - uint64_t u29 = load64_le(b + 232U); - ws[29U] = u29; - uint64_t u30 = load64_le(b + 240U); - ws[30U] = u30; - uint64_t u31 = load64_le(b + 248U); - ws[31U] = u31; - for (uint32_t i = 0U; i < 25U; i++) - { - state[i] = state[i] ^ ws[i]; - } - uint8_t b3[256U] = { 0U }; - uint8_t *b4 = b3; - uint8_t *b0 = b4; - b0[167U] = 0x80U; - uint64_t ws0[32U] = { 0U }; - uint8_t *b1 = b4; - uint64_t u = load64_le(b1); - ws0[0U] = u; - uint64_t u32 = load64_le(b1 + 8U); - ws0[1U] = u32; - uint64_t u33 = load64_le(b1 + 16U); - ws0[2U] = u33; - uint64_t u34 = load64_le(b1 + 24U); - ws0[3U] = u34; - uint64_t u35 = load64_le(b1 + 32U); - ws0[4U] = u35; - uint64_t u36 = load64_le(b1 + 40U); - ws0[5U] = u36; - uint64_t u37 = load64_le(b1 + 48U); - ws0[6U] = u37; - uint64_t u38 = load64_le(b1 + 56U); - ws0[7U] = u38; - uint64_t u39 = load64_le(b1 + 64U); - ws0[8U] = u39; - uint64_t u40 = load64_le(b1 + 72U); - ws0[9U] = u40; - uint64_t u41 = load64_le(b1 + 80U); - ws0[10U] = u41; - uint64_t u42 = load64_le(b1 + 88U); - ws0[11U] = u42; - uint64_t u43 = load64_le(b1 + 96U); - ws0[12U] = u43; - uint64_t u44 = load64_le(b1 + 104U); - ws0[13U] = u44; - uint64_t u45 = load64_le(b1 + 112U); - ws0[14U] = u45; - uint64_t u46 = load64_le(b1 + 120U); - ws0[15U] = u46; - uint64_t u47 = load64_le(b1 + 128U); - ws0[16U] = u47; - uint64_t u48 = load64_le(b1 + 136U); - ws0[17U] = u48; - uint64_t u49 = load64_le(b1 + 144U); - ws0[18U] = u49; - uint64_t u50 = load64_le(b1 + 152U); - ws0[19U] = u50; - uint64_t u51 = load64_le(b1 + 160U); - ws0[20U] = u51; - uint64_t u52 = load64_le(b1 + 168U); - ws0[21U] = u52; - uint64_t u53 = load64_le(b1 + 176U); - ws0[22U] = u53; - uint64_t u54 = load64_le(b1 + 184U); - ws0[23U] = u54; - uint64_t u55 = load64_le(b1 + 192U); - ws0[24U] = u55; - uint64_t u56 = load64_le(b1 + 200U); - ws0[25U] = u56; - uint64_t u57 = load64_le(b1 + 208U); - ws0[26U] = u57; - uint64_t u58 = load64_le(b1 + 216U); - ws0[27U] = u58; - uint64_t u59 = load64_le(b1 + 224U); - ws0[28U] = u59; - uint64_t u60 = load64_le(b1 + 232U); - ws0[29U] = u60; - uint64_t u61 = load64_le(b1 + 240U); - ws0[30U] = u61; - uint64_t u62 = load64_le(b1 + 248U); - ws0[31U] = u62; - for (uint32_t i = 0U; i < 25U; i++) - { - state[i] = state[i] ^ ws0[i]; - } - for (uint32_t i0 = 0U; i0 < 24U; i0++) - { - uint64_t _C[5U] = { 0U }; - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - _C[i] = state[i + 0U] ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); - KRML_MAYBE_FOR5(i1, - 0U, - 5U, - 1U, - uint64_t uu____2 = _C[(i1 + 1U) % 5U]; - uint64_t _D = _C[(i1 + 4U) % 5U] ^ (uu____2 << 1U | uu____2 >> 63U); - KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i1 + 5U * i] = state[i1 + 5U * i] ^ _D;);); - uint64_t x = state[1U]; - uint64_t current = x; - for (uint32_t i = 0U; i < 24U; i++) - { - uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; - uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; - uint64_t temp = state[_Y]; - uint64_t uu____3 = current; - state[_Y] = uu____3 << r | uu____3 >> (64U - r); - current = temp; - } - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); - uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); - uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); - uint64_t v3 = state[3U + 5U * i] ^ 
(~state[4U + 5U * i] & state[0U + 5U * i]); - uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); - state[0U + 5U * i] = v0; - state[1U + 5U * i] = v1; - state[2U + 5U * i] = v2; - state[3U + 5U * i] = v3; - state[4U + 5U * i] = v4;); - uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; - state[0U] = state[0U] ^ c; - } -} - void Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( uint64_t *state, diff --git a/src/Hacl_Hash_SHA3_Simd256.c b/src/Hacl_Hash_SHA3_Simd256.c index 9748a375..9046f3db 100644 --- a/src/Hacl_Hash_SHA3_Simd256.c +++ b/src/Hacl_Hash_SHA3_Simd256.c @@ -10323,7 +10323,7 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks( } void -Hacl_Hash_SHA3_Simd256_shake128_absorb_last( +Hacl_Hash_SHA3_Simd256_shake128_absorb_final( Lib_IntVector_Intrinsics_vec256 *state, uint8_t *input0, uint8_t *input1, diff --git a/src/msvc/Hacl_Hash_SHA3_Scalar.c b/src/msvc/Hacl_Hash_SHA3_Scalar.c index 7393ebf2..6d6806a3 100644 --- a/src/msvc/Hacl_Hash_SHA3_Scalar.c +++ b/src/msvc/Hacl_Hash_SHA3_Scalar.c @@ -2526,7 +2526,7 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks( } void -Hacl_Hash_SHA3_Scalar_shake128_absorb_last( +Hacl_Hash_SHA3_Scalar_shake128_absorb_final( uint64_t *state, uint8_t *input, uint32_t inputByteLen @@ -2730,331 +2730,6 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_last( } } -void -Hacl_Hash_SHA3_Scalar_shake128_absorb(uint64_t *state, uint8_t *input, uint32_t inputByteLen) -{ - for (uint32_t i0 = 0U; i0 < inputByteLen / 168U; i0++) - { - uint8_t b1[256U] = { 0U }; - uint8_t *b_ = b1; - uint8_t *b0 = input; - uint8_t *bl0 = b_; - memcpy(bl0, b0 + i0 * 168U, 168U * sizeof (uint8_t)); - uint64_t ws[32U] = { 0U }; - uint8_t *b = b_; - uint64_t u = load64_le(b); - ws[0U] = u; - uint64_t u0 = load64_le(b + 8U); - ws[1U] = u0; - uint64_t u1 = load64_le(b + 16U); - ws[2U] = u1; - uint64_t u2 = load64_le(b + 24U); - ws[3U] = u2; - uint64_t u3 = load64_le(b + 32U); - ws[4U] = u3; - uint64_t u4 = load64_le(b + 40U); - ws[5U] = u4; - uint64_t u5 = load64_le(b + 48U); - ws[6U] = u5; - uint64_t u6 = load64_le(b + 56U); - ws[7U] = u6; - uint64_t u7 = load64_le(b + 64U); - ws[8U] = u7; - uint64_t u8 = load64_le(b + 72U); - ws[9U] = u8; - uint64_t u9 = load64_le(b + 80U); - ws[10U] = u9; - uint64_t u10 = load64_le(b + 88U); - ws[11U] = u10; - uint64_t u11 = load64_le(b + 96U); - ws[12U] = u11; - uint64_t u12 = load64_le(b + 104U); - ws[13U] = u12; - uint64_t u13 = load64_le(b + 112U); - ws[14U] = u13; - uint64_t u14 = load64_le(b + 120U); - ws[15U] = u14; - uint64_t u15 = load64_le(b + 128U); - ws[16U] = u15; - uint64_t u16 = load64_le(b + 136U); - ws[17U] = u16; - uint64_t u17 = load64_le(b + 144U); - ws[18U] = u17; - uint64_t u18 = load64_le(b + 152U); - ws[19U] = u18; - uint64_t u19 = load64_le(b + 160U); - ws[20U] = u19; - uint64_t u20 = load64_le(b + 168U); - ws[21U] = u20; - uint64_t u21 = load64_le(b + 176U); - ws[22U] = u21; - uint64_t u22 = load64_le(b + 184U); - ws[23U] = u22; - uint64_t u23 = load64_le(b + 192U); - ws[24U] = u23; - uint64_t u24 = load64_le(b + 200U); - ws[25U] = u24; - uint64_t u25 = load64_le(b + 208U); - ws[26U] = u25; - uint64_t u26 = load64_le(b + 216U); - ws[27U] = u26; - uint64_t u27 = load64_le(b + 224U); - ws[28U] = u27; - uint64_t u28 = load64_le(b + 232U); - ws[29U] = u28; - uint64_t u29 = load64_le(b + 240U); - ws[30U] = u29; - uint64_t u30 = load64_le(b + 248U); - ws[31U] = u30; - for (uint32_t i = 0U; i < 25U; i++) - { - state[i] = state[i] ^ ws[i]; - } - for (uint32_t i1 = 0U; i1 < 24U; i1++) - { - uint64_t _C[5U] = { 0U }; - 
KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - _C[i] = - state[i - + 0U] - ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); - KRML_MAYBE_FOR5(i2, - 0U, - 5U, - 1U, - uint64_t uu____0 = _C[(i2 + 1U) % 5U]; - uint64_t _D = _C[(i2 + 4U) % 5U] ^ (uu____0 << 1U | uu____0 >> 63U); - KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i2 + 5U * i] = state[i2 + 5U * i] ^ _D;);); - uint64_t x = state[1U]; - uint64_t current = x; - for (uint32_t i = 0U; i < 24U; i++) - { - uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; - uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; - uint64_t temp = state[_Y]; - uint64_t uu____1 = current; - state[_Y] = uu____1 << r | uu____1 >> (64U - r); - current = temp; - } - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); - uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); - uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); - uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); - uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); - state[0U + 5U * i] = v0; - state[1U + 5U * i] = v1; - state[2U + 5U * i] = v2; - state[3U + 5U * i] = v3; - state[4U + 5U * i] = v4;); - uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i1]; - state[0U] = state[0U] ^ c; - } - } - uint32_t rem = inputByteLen % 168U; - uint8_t b2[256U] = { 0U }; - uint8_t *b_ = b2; - uint32_t rem1 = inputByteLen % 168U; - uint8_t *b00 = input; - uint8_t *bl0 = b_; - memcpy(bl0, b00 + inputByteLen - rem1, rem1 * sizeof (uint8_t)); - uint8_t *b01 = b_; - b01[rem] = 0x1FU; - uint64_t ws[32U] = { 0U }; - uint8_t *b = b_; - uint64_t u0 = load64_le(b); - ws[0U] = u0; - uint64_t u1 = load64_le(b + 8U); - ws[1U] = u1; - uint64_t u2 = load64_le(b + 16U); - ws[2U] = u2; - uint64_t u3 = load64_le(b + 24U); - ws[3U] = u3; - uint64_t u4 = load64_le(b + 32U); - ws[4U] = u4; - uint64_t u5 = load64_le(b + 40U); - ws[5U] = u5; - uint64_t u6 = load64_le(b + 48U); - ws[6U] = u6; - uint64_t u7 = load64_le(b + 56U); - ws[7U] = u7; - uint64_t u8 = load64_le(b + 64U); - ws[8U] = u8; - uint64_t u9 = load64_le(b + 72U); - ws[9U] = u9; - uint64_t u10 = load64_le(b + 80U); - ws[10U] = u10; - uint64_t u11 = load64_le(b + 88U); - ws[11U] = u11; - uint64_t u12 = load64_le(b + 96U); - ws[12U] = u12; - uint64_t u13 = load64_le(b + 104U); - ws[13U] = u13; - uint64_t u14 = load64_le(b + 112U); - ws[14U] = u14; - uint64_t u15 = load64_le(b + 120U); - ws[15U] = u15; - uint64_t u16 = load64_le(b + 128U); - ws[16U] = u16; - uint64_t u17 = load64_le(b + 136U); - ws[17U] = u17; - uint64_t u18 = load64_le(b + 144U); - ws[18U] = u18; - uint64_t u19 = load64_le(b + 152U); - ws[19U] = u19; - uint64_t u20 = load64_le(b + 160U); - ws[20U] = u20; - uint64_t u21 = load64_le(b + 168U); - ws[21U] = u21; - uint64_t u22 = load64_le(b + 176U); - ws[22U] = u22; - uint64_t u23 = load64_le(b + 184U); - ws[23U] = u23; - uint64_t u24 = load64_le(b + 192U); - ws[24U] = u24; - uint64_t u25 = load64_le(b + 200U); - ws[25U] = u25; - uint64_t u26 = load64_le(b + 208U); - ws[26U] = u26; - uint64_t u27 = load64_le(b + 216U); - ws[27U] = u27; - uint64_t u28 = load64_le(b + 224U); - ws[28U] = u28; - uint64_t u29 = load64_le(b + 232U); - ws[29U] = u29; - uint64_t u30 = load64_le(b + 240U); - ws[30U] = u30; - uint64_t u31 = load64_le(b + 248U); - ws[31U] = u31; - for (uint32_t i = 0U; i < 25U; i++) - { - state[i] = state[i] ^ ws[i]; - } - uint8_t b3[256U] = { 0U }; - uint8_t *b4 = b3; - uint8_t *b0 = b4; 
- b0[167U] = 0x80U; - uint64_t ws0[32U] = { 0U }; - uint8_t *b1 = b4; - uint64_t u = load64_le(b1); - ws0[0U] = u; - uint64_t u32 = load64_le(b1 + 8U); - ws0[1U] = u32; - uint64_t u33 = load64_le(b1 + 16U); - ws0[2U] = u33; - uint64_t u34 = load64_le(b1 + 24U); - ws0[3U] = u34; - uint64_t u35 = load64_le(b1 + 32U); - ws0[4U] = u35; - uint64_t u36 = load64_le(b1 + 40U); - ws0[5U] = u36; - uint64_t u37 = load64_le(b1 + 48U); - ws0[6U] = u37; - uint64_t u38 = load64_le(b1 + 56U); - ws0[7U] = u38; - uint64_t u39 = load64_le(b1 + 64U); - ws0[8U] = u39; - uint64_t u40 = load64_le(b1 + 72U); - ws0[9U] = u40; - uint64_t u41 = load64_le(b1 + 80U); - ws0[10U] = u41; - uint64_t u42 = load64_le(b1 + 88U); - ws0[11U] = u42; - uint64_t u43 = load64_le(b1 + 96U); - ws0[12U] = u43; - uint64_t u44 = load64_le(b1 + 104U); - ws0[13U] = u44; - uint64_t u45 = load64_le(b1 + 112U); - ws0[14U] = u45; - uint64_t u46 = load64_le(b1 + 120U); - ws0[15U] = u46; - uint64_t u47 = load64_le(b1 + 128U); - ws0[16U] = u47; - uint64_t u48 = load64_le(b1 + 136U); - ws0[17U] = u48; - uint64_t u49 = load64_le(b1 + 144U); - ws0[18U] = u49; - uint64_t u50 = load64_le(b1 + 152U); - ws0[19U] = u50; - uint64_t u51 = load64_le(b1 + 160U); - ws0[20U] = u51; - uint64_t u52 = load64_le(b1 + 168U); - ws0[21U] = u52; - uint64_t u53 = load64_le(b1 + 176U); - ws0[22U] = u53; - uint64_t u54 = load64_le(b1 + 184U); - ws0[23U] = u54; - uint64_t u55 = load64_le(b1 + 192U); - ws0[24U] = u55; - uint64_t u56 = load64_le(b1 + 200U); - ws0[25U] = u56; - uint64_t u57 = load64_le(b1 + 208U); - ws0[26U] = u57; - uint64_t u58 = load64_le(b1 + 216U); - ws0[27U] = u58; - uint64_t u59 = load64_le(b1 + 224U); - ws0[28U] = u59; - uint64_t u60 = load64_le(b1 + 232U); - ws0[29U] = u60; - uint64_t u61 = load64_le(b1 + 240U); - ws0[30U] = u61; - uint64_t u62 = load64_le(b1 + 248U); - ws0[31U] = u62; - for (uint32_t i = 0U; i < 25U; i++) - { - state[i] = state[i] ^ ws0[i]; - } - for (uint32_t i0 = 0U; i0 < 24U; i0++) - { - uint64_t _C[5U] = { 0U }; - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - _C[i] = state[i + 0U] ^ (state[i + 5U] ^ (state[i + 10U] ^ (state[i + 15U] ^ state[i + 20U])));); - KRML_MAYBE_FOR5(i1, - 0U, - 5U, - 1U, - uint64_t uu____2 = _C[(i1 + 1U) % 5U]; - uint64_t _D = _C[(i1 + 4U) % 5U] ^ (uu____2 << 1U | uu____2 >> 63U); - KRML_MAYBE_FOR5(i, 0U, 5U, 1U, state[i1 + 5U * i] = state[i1 + 5U * i] ^ _D;);); - uint64_t x = state[1U]; - uint64_t current = x; - for (uint32_t i = 0U; i < 24U; i++) - { - uint32_t _Y = Hacl_Impl_SHA3_Vec_keccak_piln[i]; - uint32_t r = Hacl_Impl_SHA3_Vec_keccak_rotc[i]; - uint64_t temp = state[_Y]; - uint64_t uu____3 = current; - state[_Y] = uu____3 << r | uu____3 >> (64U - r); - current = temp; - } - KRML_MAYBE_FOR5(i, - 0U, - 5U, - 1U, - uint64_t v0 = state[0U + 5U * i] ^ (~state[1U + 5U * i] & state[2U + 5U * i]); - uint64_t v1 = state[1U + 5U * i] ^ (~state[2U + 5U * i] & state[3U + 5U * i]); - uint64_t v2 = state[2U + 5U * i] ^ (~state[3U + 5U * i] & state[4U + 5U * i]); - uint64_t v3 = state[3U + 5U * i] ^ (~state[4U + 5U * i] & state[0U + 5U * i]); - uint64_t v4 = state[4U + 5U * i] ^ (~state[0U + 5U * i] & state[1U + 5U * i]); - state[0U + 5U * i] = v0; - state[1U + 5U * i] = v1; - state[2U + 5U * i] = v2; - state[3U + 5U * i] = v3; - state[4U + 5U * i] = v4;); - uint64_t c = Hacl_Impl_SHA3_Vec_keccak_rndc[i0]; - state[0U] = state[0U] ^ c; - } -} - void Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks( uint64_t *state, diff --git a/src/msvc/Hacl_Hash_SHA3_Simd256.c b/src/msvc/Hacl_Hash_SHA3_Simd256.c index 
9748a375..9046f3db 100644
--- a/src/msvc/Hacl_Hash_SHA3_Simd256.c
+++ b/src/msvc/Hacl_Hash_SHA3_Simd256.c
@@ -10323,7 +10323,7 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks(
 }
 
 void
-Hacl_Hash_SHA3_Simd256_shake128_absorb_last(
+Hacl_Hash_SHA3_Simd256_shake128_absorb_final(
   Lib_IntVector_Intrinsics_vec256 *state,
   uint8_t *input0,
   uint8_t *input1,

From 60488ea2f8679cd9aec5960d297363716d1fd198 Mon Sep 17 00:00:00 2001
From: mamonet
Date: Wed, 17 Jan 2024 13:18:42 +0200
Subject: [PATCH 5/6] Document some API in SHA3 Scalar/SIMD256 headers

---
 include/Hacl_Hash_SHA3_Scalar.h       | 43 +++++++++++++++++++++++++
 include/Hacl_Hash_SHA3_Simd256.h      | 46 +++++++++++++++++++++++++++
 include/msvc/Hacl_Hash_SHA3_Scalar.h  | 43 +++++++++++++++++++++++++
 include/msvc/Hacl_Hash_SHA3_Simd256.h | 46 +++++++++++++++++++++++++++
 4 files changed, 178 insertions(+)

diff --git a/include/Hacl_Hash_SHA3_Scalar.h b/include/Hacl_Hash_SHA3_Scalar.h
index 63cf8710..a40c2d04 100644
--- a/include/Hacl_Hash_SHA3_Scalar.h
+++ b/include/Hacl_Hash_SHA3_Scalar.h
@@ -59,10 +59,27 @@ void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t in
 
 void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen);
 
+/**
+Allocate a state buffer of 200 bytes
+*/
 uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void);
 
+/**
+Free a state buffer
+*/
 void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s);
 
+/**
+Absorb a number of input blocks and write the output state
+
+  This function is intended to receive a hash state and an input buffer.
+  It processes an input that is a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block are ignored.
+
+  The argument `state` (IN/OUT) points to the hash state, i.e., uint64_t[25]
+  The argument `input` (IN) points to `inputByteLen` bytes of valid memory,
+  i.e., uint8_t[inputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks(
   uint64_t *state,
@@ -70,6 +87,21 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks(
   uint32_t inputByteLen
 );
 
+/**
+Absorb the final partial block of input and write the output state
+
+  This function is intended to receive a hash state and an input buffer.
+  It processes the sequence of bytes at the end of the input buffer that is
+  shorter than 168 bytes (the SHAKE128 block size);
+  any bytes of full blocks at the start of the input buffer are ignored.
+
+  The argument `state` (IN/OUT) points to the hash state, i.e., uint64_t[25]
+  The argument `input` (IN) points to `inputByteLen` bytes of valid memory,
+  i.e., uint8_t[inputByteLen]
+
+  Note: The full size of the input buffer must be passed in `inputByteLen`,
+  including the full-block bytes at the start of the input buffer that are ignored
+*/
 void
 Hacl_Hash_SHA3_Scalar_shake128_absorb_final(
   uint64_t *state,
@@ -77,6 +109,17 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_final(
   uint32_t inputByteLen
 );
 
+/**
+Squeeze a hash state to an output buffer
+
+  This function is intended to receive a hash state and an output buffer.
+  It produces an output that is a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block are ignored.
+
+  The argument `state` (IN) points to the hash state, i.e., uint64_t[25]
+  The argument `output` (OUT) points to `outputByteLen` bytes of valid memory,
+  i.e., uint8_t[outputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks(
   uint64_t *state,
diff --git a/include/Hacl_Hash_SHA3_Simd256.h b/include/Hacl_Hash_SHA3_Simd256.h
index 25c1a166..302094a4 100644
--- a/include/Hacl_Hash_SHA3_Simd256.h
+++ b/include/Hacl_Hash_SHA3_Simd256.h
@@ -138,10 +138,28 @@ Hacl_Hash_SHA3_Simd256_sha3_512(
   uint32_t inputByteLen
 );
 
+/**
+Allocate a quadruple state buffer (200 bytes for each)
+*/
 uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void);
 
+/**
+Free a quadruple state buffer
+*/
 void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s);
 
+/**
+Absorb a number of blocks from 4 input buffers and write the output states
+
+  This function is intended to receive a quadruple hash state and 4 input buffers.
+  It processes inputs that are a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block in each buffer are ignored.
+
+  The argument `state` (IN/OUT) points to the quadruple hash state,
+  i.e., Lib_IntVector_Intrinsics_vec256[25]
+  The arguments `input0/input1/input2/input3` (IN) point to `inputByteLen` bytes
+  of valid memory for each buffer, i.e., uint8_t[inputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks(
   Lib_IntVector_Intrinsics_vec256 *state,
@@ -152,6 +170,22 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks(
   uint32_t inputByteLen
 );
 
+/**
+Absorb the final partial blocks of 4 input buffers and write the output states
+
+  This function is intended to receive a quadruple hash state and 4 input buffers.
+  It processes the sequence of bytes at the end of each input buffer that is
+  shorter than 168 bytes (the SHAKE128 block size);
+  any bytes of full blocks at the start of the input buffers are ignored.
+
+  The argument `state` (IN/OUT) points to the quadruple hash state,
+  i.e., Lib_IntVector_Intrinsics_vec256[25]
+  The arguments `input0/input1/input2/input3` (IN) point to `inputByteLen` bytes
+  of valid memory for each buffer, i.e., uint8_t[inputByteLen]
+
+  Note: The full size of the input buffers must be passed in `inputByteLen`,
+  including the full-block bytes at the start of each input buffer that are ignored
+*/
 void
 Hacl_Hash_SHA3_Simd256_shake128_absorb_final(
   Lib_IntVector_Intrinsics_vec256 *state,
@@ -162,6 +196,18 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_final(
   uint32_t inputByteLen
 );
 
+/**
+Squeeze a quadruple hash state to 4 output buffers
+
+  This function is intended to receive a quadruple hash state and 4 output buffers.
+  It produces 4 outputs, each a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block in each buffer are ignored.
+
+  The argument `state` (IN) points to the quadruple hash state,
+  i.e., Lib_IntVector_Intrinsics_vec256[25]
+  The arguments `output0/output1/output2/output3` (OUT) point to `outputByteLen` bytes
+  of valid memory for each buffer, i.e., uint8_t[outputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks(
   Lib_IntVector_Intrinsics_vec256 *state,
diff --git a/include/msvc/Hacl_Hash_SHA3_Scalar.h b/include/msvc/Hacl_Hash_SHA3_Scalar.h
index 63cf8710..a40c2d04 100644
--- a/include/msvc/Hacl_Hash_SHA3_Scalar.h
+++ b/include/msvc/Hacl_Hash_SHA3_Scalar.h
@@ -59,10 +59,27 @@ void Hacl_Hash_SHA3_Scalar_sha3_384(uint8_t *output, uint8_t *input, uint32_t in
 
 void Hacl_Hash_SHA3_Scalar_sha3_512(uint8_t *output, uint8_t *input, uint32_t inputByteLen);
 
+/**
+Allocate a state buffer of 200 bytes
+*/
 uint64_t *Hacl_Hash_SHA3_Scalar_state_malloc(void);
 
+/**
+Free a state buffer
+*/
 void Hacl_Hash_SHA3_Scalar_state_free(uint64_t *s);
 
+/**
+Absorb a number of input blocks and write the output state
+
+  This function is intended to receive a hash state and an input buffer.
+  It processes an input that is a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block are ignored.
+
+  The argument `state` (IN/OUT) points to the hash state, i.e., uint64_t[25]
+  The argument `input` (IN) points to `inputByteLen` bytes of valid memory,
+  i.e., uint8_t[inputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks(
   uint64_t *state,
@@ -70,6 +87,21 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks(
   uint32_t inputByteLen
 );
 
+/**
+Absorb the final partial block of input and write the output state
+
+  This function is intended to receive a hash state and an input buffer.
+  It processes the sequence of bytes at the end of the input buffer that is
+  shorter than 168 bytes (the SHAKE128 block size);
+  any bytes of full blocks at the start of the input buffer are ignored.
+
+  The argument `state` (IN/OUT) points to the hash state, i.e., uint64_t[25]
+  The argument `input` (IN) points to `inputByteLen` bytes of valid memory,
+  i.e., uint8_t[inputByteLen]
+
+  Note: The full size of the input buffer must be passed in `inputByteLen`,
+  including the full-block bytes at the start of the input buffer that are ignored
+*/
 void
 Hacl_Hash_SHA3_Scalar_shake128_absorb_final(
   uint64_t *state,
@@ -77,6 +109,17 @@ Hacl_Hash_SHA3_Scalar_shake128_absorb_final(
   uint32_t inputByteLen
 );
 
+/**
+Squeeze a hash state to an output buffer
+
+  This function is intended to receive a hash state and an output buffer.
+  It produces an output that is a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block are ignored.
+
+  The argument `state` (IN) points to the hash state, i.e., uint64_t[25]
+  The argument `output` (OUT) points to `outputByteLen` bytes of valid memory,
+  i.e., uint8_t[outputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks(
   uint64_t *state,
diff --git a/include/msvc/Hacl_Hash_SHA3_Simd256.h b/include/msvc/Hacl_Hash_SHA3_Simd256.h
index 25c1a166..302094a4 100644
--- a/include/msvc/Hacl_Hash_SHA3_Simd256.h
+++ b/include/msvc/Hacl_Hash_SHA3_Simd256.h
@@ -138,10 +138,28 @@ Hacl_Hash_SHA3_Simd256_sha3_512(
   uint32_t inputByteLen
 );
 
+/**
+Allocate a quadruple state buffer (200 bytes for each)
+*/
 uint64_t *Hacl_Hash_SHA3_Simd256_state_malloc(void);
 
+/**
+Free a quadruple state buffer
+*/
 void Hacl_Hash_SHA3_Simd256_state_free(uint64_t *s);
 
+/**
+Absorb a number of blocks from 4 input buffers and write the output states
+
+  This function is intended to receive a quadruple hash state and 4 input buffers.
+  It processes inputs that are a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block in each buffer are ignored.
+
+  The argument `state` (IN/OUT) points to the quadruple hash state,
+  i.e., Lib_IntVector_Intrinsics_vec256[25]
+  The arguments `input0/input1/input2/input3` (IN) point to `inputByteLen` bytes
+  of valid memory for each buffer, i.e., uint8_t[inputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks(
   Lib_IntVector_Intrinsics_vec256 *state,
@@ -152,6 +170,22 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_nblocks(
   uint32_t inputByteLen
 );
 
+/**
+Absorb the final partial blocks of 4 input buffers and write the output states
+
+  This function is intended to receive a quadruple hash state and 4 input buffers.
+  It processes the sequence of bytes at the end of each input buffer that is
+  shorter than 168 bytes (the SHAKE128 block size);
+  any bytes of full blocks at the start of the input buffers are ignored.
+
+  The argument `state` (IN/OUT) points to the quadruple hash state,
+  i.e., Lib_IntVector_Intrinsics_vec256[25]
+  The arguments `input0/input1/input2/input3` (IN) point to `inputByteLen` bytes
+  of valid memory for each buffer, i.e., uint8_t[inputByteLen]
+
+  Note: The full size of the input buffers must be passed in `inputByteLen`,
+  including the full-block bytes at the start of each input buffer that are ignored
+*/
 void
 Hacl_Hash_SHA3_Simd256_shake128_absorb_final(
   Lib_IntVector_Intrinsics_vec256 *state,
@@ -162,6 +196,18 @@ Hacl_Hash_SHA3_Simd256_shake128_absorb_final(
   uint32_t inputByteLen
 );
 
+/**
+Squeeze a quadruple hash state to 4 output buffers
+
+  This function is intended to receive a quadruple hash state and 4 output buffers.
+  It produces 4 outputs, each a multiple of 168 bytes (the SHAKE128 block size);
+  any additional bytes of a final partial block in each buffer are ignored.
+
+  The argument `state` (IN) points to the quadruple hash state,
+  i.e., Lib_IntVector_Intrinsics_vec256[25]
+  The arguments `output0/output1/output2/output3` (OUT) point to `outputByteLen` bytes
+  of valid memory for each buffer, i.e., uint8_t[outputByteLen]
+*/
 void
 Hacl_Hash_SHA3_Simd256_shake128_squeeze_nblocks(
   Lib_IntVector_Intrinsics_vec256 *state,

From 856e97ad6f1b487813bd36ddd5e76ab0c564f5ab Mon Sep 17 00:00:00 2001
From: mamonet
Date: Wed, 17 Jan 2024 13:56:00 +0200
Subject: [PATCH 6/6] Fix calling Hacl_Hash_SHA3_Simd256_shake128 in libcrux_hacl_glue.c

---
 libcrux/src/libcrux_hacl_glue.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcrux/src/libcrux_hacl_glue.c b/libcrux/src/libcrux_hacl_glue.c
index 4346053f..1c02291b 100644
--- a/libcrux/src/libcrux_hacl_glue.c
+++ b/libcrux/src/libcrux_hacl_glue.c
@@ -39,16 +39,16 @@ libcrux_digest_shake128x4f(size_t len,
   };
 #ifdef HACL_CAN_COMPILE_VEC256
   if (libcrux_platform_simd256_support() == true) {
-    Hacl_Hash_SHA3_Simd256_shake128(input0.len,
+    Hacl_Hash_SHA3_Simd256_shake128(out.fst,
+                                    out.snd,
+                                    out.thd,
+                                    out.f3,
+                                    (uint32_t)len,
                                     input0.ptr,
                                     input1.ptr,
                                     input2.ptr,
                                     input3.ptr,
-                                    (uint32_t)len,
-                                    out.fst,
-                                    out.snd,
-                                    out.thd,
-                                    out.f3);
+                                    input0.len);
   } else {
     Hacl_Hash_SHA3_shake128_hacl(
      input0.len, input0.ptr, (uint32_t)len, out.fst);
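Migration note for the one-shot Hacl_Hash_SHA3_Scalar_shake128_absorb removed in
PATCH 4/6: per the header documentation added in PATCH 5/6,
shake128_absorb_nblocks consumes only the full 168-byte blocks of the input, and
shake128_absorb_final pads and absorbs only the trailing partial block (while
still being passed the full inputByteLen), so calling the two back to back on
the same buffer reproduces the removed one-shot absorb. Below is a minimal
sketch, not part of this series: the wrapper name shake128_oneshot is
hypothetical, and it assumes that state_malloc returns a freshly zeroed state,
that squeeze_nblocks takes (state, output, outputByteLen) as its documentation
block suggests, and that outputByteLen is a multiple of the 168-byte rate, since
squeeze_nblocks emits whole blocks only.

    #include <stdint.h>
    #include "Hacl_Hash_SHA3_Scalar.h"

    /* Hypothetical wrapper: absorb `input` and squeeze `outputByteLen` bytes
       of SHAKE128 output, mimicking the removed one-shot absorb API. */
    static void
    shake128_oneshot(uint8_t *output, uint32_t outputByteLen,
                     uint8_t *input, uint32_t inputByteLen)
    {
      uint64_t *state = Hacl_Hash_SHA3_Scalar_state_malloc();
      /* Absorbs the inputByteLen / 168 full blocks; the tail is ignored. */
      Hacl_Hash_SHA3_Scalar_shake128_absorb_nblocks(state, input, inputByteLen);
      /* Pads and absorbs the final inputByteLen % 168 bytes; the full length
         is passed, and the full blocks at the start are skipped. When the
         input is an exact multiple of 168 bytes, this absorbs the padding
         block alone, matching the removed one-shot code path. */
      Hacl_Hash_SHA3_Scalar_shake128_absorb_final(state, input, inputByteLen);
      /* Squeezes outputByteLen / 168 whole blocks into `output`. */
      Hacl_Hash_SHA3_Scalar_shake128_squeeze_nblocks(state, output, outputByteLen);
      Hacl_Hash_SHA3_Scalar_state_free(state);
    }

For example, shake128_oneshot(output, 336U, input, inputByteLen) would produce
two full SHAKE128 blocks of output under these assumptions.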