From 5edf1fa75bcf3fdc96178c952fbe08dd1f760d92 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Tue, 27 Feb 2024 14:09:19 +0100
Subject: [PATCH 01/16] Move to grep.

egrep is deprecated (GNU grep 3.8 and later print an obsolescence warning);
use the equivalent `grep -E` instead.
---
 scripts/ci/jlog          | 4 ++--
 scripts/ci/reporter/jlog | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/ci/jlog b/scripts/ci/jlog
index fe99c9c4..f3dbf4c7 100755
--- a/scripts/ci/jlog
+++ b/scripts/ci/jlog
@@ -34,13 +34,13 @@ implementations_status "${wcard}.error" $error;
 echo "${BOLD}Status: ${NORMAL}"
 
 # print implementations with zero warnings in 'green'
-cat $warning | egrep -E "^0, " | \
+cat $warning | grep -E "^0, " | \
 while read line; do
  echo "${GREEN}${BOLD}OK, ${line}${NORMAL}"
 done
 
 # print implementations with some warnings in 'yellow'
-cat $warning | egrep -vE "^0, " | \
+cat $warning | grep -vE "^0, " | \
 while read line; do
  echo "${YELLOW}${BOLD}W, ${line}${NORMAL}"
 done
diff --git a/scripts/ci/reporter/jlog b/scripts/ci/reporter/jlog
index 1c26d46d..e488434a 100755
--- a/scripts/ci/reporter/jlog
+++ b/scripts/ci/reporter/jlog
@@ -36,7 +36,7 @@ print()
   file=$3;
   label=$4;
 
-  egrep -E "$filter" $file | \
+  grep -E "$filter" $file | \
   while read line; do
    line=${line/$dir\//};
    echo -e "${color}${BOLD}${label}, ${line}${NORMAL}"
@@ -46,7 +46,7 @@ print()
 clear_empty()
 {
   file=$1;
-  egrep -E "^0" $file | cut -d',' -f2 | \
+  grep -E "^0" $file | cut -d',' -f2 | \
   while read log; do
     rm -f "$log";
   done

From 6a3da1ab067a75db9ce28a886aea2fad587cdad3 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Tue, 27 Feb 2024 14:59:08 +0100
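Subject: [PATCH 02/16] Make crypto_hash constant-time under DOIT.

Replace the ROL, ROR and BSWAP instructions, which the code comments note are
not on the DOIT list, with shift/or sequences and byte-wise moves in the
Keccak-f[1600], SHA-256 and SHA-512 amd64 code.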

---
 .../keccak1600/amd64/bmi1/keccakf1600.jinc    | 16 +++++-
 .../keccak1600/amd64/ref/keccakf1600.jinc     | 16 +++++-
 .../keccak1600/amd64/ref/keccakf1600_v0.jinc  |  9 +++-
 .../keccak1600/amd64/ref1/keccakf1600.jinc    | 16 +++++-
 .../keccak1600/amd64/spec/keccakf1600.jinc    | 18 ++++++-
 src/crypto_hash/sha256/amd64/ref/sha256.jinc  | 39 +++++++++------
 src/crypto_hash/sha512/amd64/ref/sha512.jinc  | 49 ++++++++++++++-----
 7 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
index 565c69ae..6003aa4c 100644
--- a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
@@ -26,13 +26,18 @@ inline fn __theta_rol_bmi1(reg u64[5] c) -> reg u64[5]
 {
   inline int x;
   reg u64[5] d;
+  reg u64 t;
 
   for x = 0 to 5
   { // D[x] = C[x + 1]
     d[x] = c[(x + 1) % 5];
 
     // D[x] = ROT(D[x], 1)
-    _, _, d[x] = #ROL_64(d[x], 1);
+    // ROL is not DOIT, so use shifts.
+    t = d[x];
+    ?{}, t = #SHL_64(t, 1);
+    ?{}, d[x] = #SHR_64(d[x], 63);
+    ?{}, d[x] = #OR_64(t, d[x]);
 
     // D[x] ^= C[x-1]
     d[x] ^= c[(x - 1 + 5) % 5];
@@ -55,6 +60,7 @@ inline fn __rol_sum_bmi1(
 {
   inline int r x x_ y_;
   reg u64[5] b;
+  reg u64 t;
 
   for x = 0 to 5
   {
@@ -69,8 +75,14 @@ inline fn __rol_sum_bmi1(
     b[x] ^= d[x_];
 
     // B[x] = ROT( B[x], r[x',y'] );
+    // ROL is not DOIT, so use shifts.
     if(r != 0)
-    { _, _, b[x] = #ROL_64(b[x], r); }
+    {
+      t = b[x];
+      ?{}, t = #SHL_64(t, r);
+      ?{}, b[x] = #SHR_64(b[x], 64 - r);
+      ?{}, b[x] = #OR_64(t, b[x]);
+    }
 
   }
 
diff --git a/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc
index c748586d..9352df5b 100644
--- a/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc
@@ -26,13 +26,18 @@ inline fn __theta_rol_ref(reg u64[5] c) -> reg u64[5]
 {
   inline int x;
   reg u64[5] d;
+  reg u64 t;
 
   for x = 0 to 5
   { // D[x] = C[x + 1]
     d[x] = c[(x + 1) % 5];
 
     // D[x] = ROT(D[x], 1)
-    _, _, d[x] = #ROL_64(d[x], 1);
+    // ROL is not DOIT, so use shifts.
+    t = d[x];
+    ?{}, t = #SHL_64(t, 1);
+    ?{}, d[x] = #SHR_64(d[x], 63);
+    ?{}, d[x] = #OR_64(t, d[x]);
 
     // D[x] ^= C[x-1]
     d[x] ^= c[(x - 1 + 5) % 5];
@@ -55,6 +60,7 @@ inline fn __rol_sum_ref(
 {
   inline int r x x_ y_;
   reg u64[5] b;
+  reg u64 t;
 
   for x = 0 to 5
   {
@@ -69,8 +75,14 @@ inline fn __rol_sum_ref(
     b[x] ^= d[x_];
 
     // B[x] = ROT( B[x], r[x',y'] );
+    // ROL is not DOIT, so use shifts.
     if(r != 0)
-    { _, _, b[x] = #ROL_64(b[x], r); }
+    {
+      t = b[x];
+      ?{}, t = #SHL_64(t, r);
+      ?{}, b[x] = #SHR_64(b[x], 64 - r);
+      ?{}, b[x] = #OR_64(t, b[x]);
+    }
 
   }
 
diff --git a/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc b/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc
index 260147be..d36f8b9b 100644
--- a/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc
@@ -68,7 +68,12 @@ inline fn __ROL64(reg u64 x, inline int c) -> reg u64
   if (c == 0)
   { y = x; }
   else
-  { _, _, y = #ROL_64(x, c); }
+  { 
+    // _, _, y = #ROL_64(x, c);
+    ?{}, y = #SHL_64(x, c);
+    ?{}, x = #SHR_64(x, 64 - c);
+    ?{}, y = #OR_64(y, x);
+    }
   return y;
 }
 
@@ -96,7 +101,7 @@ inline fn __theta_rol_ref(reg u64[5] c) -> reg u64[5]
 
   for i = 0 to 5
   { d[i] = c[(i+1)%5];
-    _, _, d[i] = #ROL_64(d[i], 1);
+    d[i] = __ROL64(d[i], 1);
     d[i] ^= c[(i+4)%5];
   }
 
diff --git a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
index e261b30b..38c98a12 100644
--- a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
@@ -26,13 +26,18 @@ inline fn __theta_rol_ref1(reg u64[5] c) -> reg u64[5]
 {
   inline int x;
   reg u64[5] d;
+  reg u64 t;
 
   for x = 0 to 5
   { // D[x] = C[x + 1]
     d[x] = c[(x + 1) % 5];
 
     // D[x] = ROT(D[x], 1)
-    _, _, d[x] = #ROL_64(d[x], 1);
+    // ROL is not DOIT, so use shifts.
+    t = d[x];
+    ?{}, t = #SHL_64(t, 1);
+    ?{}, d[x] = #SHR_64(d[x], 63);
+    ?{}, d[x] = #OR_64(t, d[x]);
 
     // D[x] ^= C[x-1]
     d[x] ^= c[(x - 1 + 5) % 5];
@@ -55,6 +60,7 @@ inline fn __rol_sum_ref1(
 {
   inline int r x x_ y_;
   reg u64[5] b;
+  reg u64 t;
 
   for x = 0 to 5
   {
@@ -69,8 +75,14 @@ inline fn __rol_sum_ref1(
     b[x] ^= d[x_];
 
     // B[x] = ROT( B[x], r[x',y'] );
+    // ROL is not DOIT, so use shifts.
     if(r != 0)
-    { _, _, b[x] = #ROL_64(b[x], r); }
+    { 
+      t = b[x];
+      ?{}, t = #SHL_64(t, r);
+      ?{}, b[x] = #SHR_64(b[x], 64 - r);
+      ?{}, b[x] = #OR_64(t, b[x]);
+    }
 
   }
 
diff --git a/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc
index a9113bc5..0dc60f72 100644
--- a/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc
@@ -43,6 +43,7 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25]
 {
   inline int x y;
   reg u64[5] c d;
+  reg u64 t;
 
   for x = 0 to 5 {
     c[x] = 0;
@@ -53,7 +54,13 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25]
 
   for x = 0 to 5 {
     d[x] = c[(x + 1) % 5];
-    _, _, d[x] = #ROL_64(d[x], 1);
+    
+    // ROL is not DOIT, so use shifts.
+    t = d[x];
+    ?{}, t = #SHL_64(t, 1);
+    ?{}, d[x] = #SHR_64(d[x], 63);
+    ?{}, d[x] = #OR_64(t, d[x]);
+
     d[x] ^= c[(x + 4) % 5];
   }
 
@@ -70,12 +77,19 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25]
 inline fn __rho_spec(stack u64[25] a) -> stack u64[25]
 {
   inline int x y i z;
+  reg u64 t;
 
   for x = 0 to 5 {
     for y = 0 to 5 {
       i = __index_spec(x, y);
       z = __keccak_rho_offsets_spec(i);
-      _, _, a[i] = #ROL_64(a[i], z);
+
+      // ROL is not DOIT, so use shifts.
+      t = a[i];
+      ?{}, t = #SHL_64(t, z);
+      ?{}, a[i] = #SHR_64(a[i], 64 - z);
+      ?{}, t = #OR_64(t, a[i]);
+      a[i] = t;
     }
   }
 
diff --git a/src/crypto_hash/sha256/amd64/ref/sha256.jinc b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
index fa7497e4..27fbf1e7 100644
--- a/src/crypto_hash/sha256/amd64/ref/sha256.jinc
+++ b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
@@ -55,9 +55,12 @@ inline fn __store_ref(reg u64 out, stack u32[8] H)
   reg u32 v;
 
   for i=0 to 8
-  { v = H[i];
-    v = #BSWAP_32(v);
-    (u32)[out + i*4] = v;
+  {
+    //BSWAP could be used here, but it is not DOIT.
+    (u8)[out + i*4] = H[u8 i*4 + 3];
+    (u8)[out + i*4 + 1] = H[u8 i*4 + 2];
+    (u8)[out + i*4 + 2] = H[u8 i*4 + 1];
+    (u8)[out + i*4 + 3] = H[u8 i*4];
   }
 }
 
@@ -71,10 +74,14 @@ inline fn __SHR_ref(reg u32 x, inline int c) -> reg u32
 
 inline fn __ROTR_ref(reg u32 x, inline int c) -> reg u32
 {
-  reg u32 r;
-  r = x;
-  _, _, r = #ROR_32(r, c);
-  return r;
+  // ROR could be used here, but it is not DOIT.
+  reg u32 rt rb;
+  rt = x;
+  rb = x;
+  ?{}, rt = #SHR_32(rt, c);
+  ?{}, rb = #SHL_32(rb, 32 - c);
+  ?{}, rt = #OR_32(rt, rb);
+  return rt;
 }
 
 //(x & y) ^ (!x & z)
@@ -202,9 +209,11 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64
   while(inlen >= 64)
   {
     for t=0 to 16
-    { v = (u32)[in + t*4];
-      v = #BSWAP_32(v);
-      W[t] = v;
+    { //BSWAP could be used here, but it is not DOIT.
+      W[u8 t*4] = (u8)[in + t*4 + 3];
+      W[u8 t*4 + 1] = (u8)[in + t*4 + 2];
+      W[u8 t*4 + 2] = (u8)[in + t*4 + 1];
+      W[u8 t*4 + 3] = (u8)[in + t*4];
     }
     in_s = in;
 
@@ -270,7 +279,7 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64
 fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) -> reg ptr u32[8], reg ptr u32[32]
 {
   inline int t;
-  reg u32 T1 T2 a b c d e f g h r v;
+  reg u32 T1 T2 a b c d e f g h r;
   stack u32[64] W;
   reg ptr u32[64] Kp;
   stack ptr u32[8] Hp;
@@ -290,9 +299,11 @@ fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) ->
     s_i = i;
     oblocks = i << 4;
     for t=0 to 16
-    { v = sblocks[(int)oblocks + t];
-      v = #BSWAP_32(v);
-      W[t] = v;
+    { //BSWAP could be used here, but it is not DOIT.
+      W[u8 t*4] = sblocks[u8 (int)oblocks*4 + t*4 + 3];
+      W[u8 t*4 + 1] = sblocks[u8 (int)oblocks*4 + t*4 + 2];
+      W[u8 t*4 + 2] = sblocks[u8 (int)oblocks*4 + t*4 + 1];
+      W[u8 t*4 + 3] = sblocks[u8 (int)oblocks*4 + t*4];
     }
     s_sblocks = sblocks;
 
diff --git a/src/crypto_hash/sha512/amd64/ref/sha512.jinc b/src/crypto_hash/sha512/amd64/ref/sha512.jinc
index 184af39b..76e5c4ca 100644
--- a/src/crypto_hash/sha512/amd64/ref/sha512.jinc
+++ b/src/crypto_hash/sha512/amd64/ref/sha512.jinc
@@ -55,9 +55,15 @@ inline fn __store_ref(reg u64 out, stack u64[8] H)
   reg u64 v;
 
   for i=0 to 8
-  { v = H[i];
-    v = #BSWAP_64(v);
-    (u64)[out + i*8] = v;
+  { //BSWAP could be used here, but it is not DOIT.
+    (u8)[out + i*8] = H[u8 i*8 + 7];
+    (u8)[out + i*8 + 1] = H[u8 i*8 + 6];
+    (u8)[out + i*8 + 2] = H[u8 i*8 + 5];
+    (u8)[out + i*8 + 3] = H[u8 i*8 + 4];
+    (u8)[out + i*8 + 4] = H[u8 i*8 + 3];
+    (u8)[out + i*8 + 5] = H[u8 i*8 + 2];
+    (u8)[out + i*8 + 6] = H[u8 i*8 + 1];
+    (u8)[out + i*8 + 7] = H[u8 i*8];
   }
 }
 
@@ -71,10 +77,14 @@ inline fn __SHR_ref(reg u64 x, inline int c) -> reg u64
 
 inline fn __ROTR_ref(reg u64 x, inline int c) -> reg u64
 {
-  reg u64 r;
-  r = x;
-  _, _, r = #ROR_64(r, c);
-  return r;
+  reg u64 rt rb;
+  //ROR could be used here, but it is not DOIT.
+  rt = x;
+  rb = x;
+  ?{}, rt = #SHR_64(rt, c);
+  ?{}, rb = #SHL_64(rb, 64 - c);
+  ?{}, rt = #OR_64(rt, rb);
+  return rt;
 }
 
 //(x & y) ^ (!x & z)
@@ -202,9 +212,15 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64
   while(inlen >= 128)
   {
     for t=0 to 16
-    { v = (u64)[in + t*8];
-      v = #BSWAP_64(v);
-      W[t] = v;
+    { //BSWAP could be used here, but it is not DOIT.
+      W[u8 t*8] = (u8)[in + t*8 + 7];
+      W[u8 t*8 + 1] = (u8)[in + t*8 + 6];
+      W[u8 t*8 + 2] = (u8)[in + t*8 + 5];
+      W[u8 t*8 + 3] = (u8)[in + t*8 + 4];
+      W[u8 t*8 + 4] = (u8)[in + t*8 + 3];
+      W[u8 t*8 + 5] = (u8)[in + t*8 + 2];
+      W[u8 t*8 + 6] = (u8)[in + t*8 + 1];
+      W[u8 t*8 + 7] = (u8)[in + t*8];
     }
     in_s = in;
 
@@ -290,9 +306,16 @@ fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) ->
     s_i = i;
     oblocks = i << 4;
     for t=0 to 16
-    { v = sblocks[(int)oblocks + t];
-      v = #BSWAP_64(v);
-      W[t] = v;
+    { 
+      //BSWAP could be used here, but it is not DOIT.
+      W[u8 t*8] = sblocks[u8 (int)oblocks*8 + t*8 + 7];
+      W[u8 t*8 + 1] = sblocks[u8 (int)oblocks*8 + t*8 + 6];
+      W[u8 t*8 + 2] = sblocks[u8 (int)oblocks*8 + t*8 + 5];
+      W[u8 t*8 + 3] = sblocks[u8 (int)oblocks*8 + t*8 + 4];
+      W[u8 t*8 + 4] = sblocks[u8 (int)oblocks*8 + t*8 + 3];
+      W[u8 t*8 + 5] = sblocks[u8 (int)oblocks*8 + t*8 + 2];
+      W[u8 t*8 + 6] = sblocks[u8 (int)oblocks*8 + t*8 + 1];
+      W[u8 t*8 + 7] = sblocks[u8 (int)oblocks*8 + t*8];
     }
     s_sblocks = sblocks;
 

From 20f1c51074c65f911d2a06c48f0111afc78db395 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Tue, 27 Feb 2024 16:45:46 +0100
Subject: [PATCH 03/16] Introduce doit common include file. Use inlines from
 there.

---
 src/common/doit.jinc                          | 40 ++++++++++++
 .../keccak1600/amd64/bmi1/keccakf1600.jinc    | 15 +----
 .../keccak1600/amd64/ref/keccakf1600.jinc     | 15 +----
 .../keccak1600/amd64/ref/keccakf1600_v0.jinc  | 17 +----
 .../keccak1600/amd64/ref1/keccakf1600.jinc    | 15 +----
 .../keccak1600/amd64/spec/keccakf1600.jinc    | 13 +---
 src/crypto_hash/sha256/amd64/ref/sha256.jinc  | 26 +++-----
 src/crypto_hash/sha512/amd64/ref/sha512.jinc  | 65 +++++++------------
 8 files changed, 86 insertions(+), 120 deletions(-)
 create mode 100644 src/common/doit.jinc

diff --git a/src/common/doit.jinc b/src/common/doit.jinc
new file mode 100644
index 00000000..2039c1f6
--- /dev/null
+++ b/src/common/doit.jinc
@@ -0,0 +1,40 @@
+// This file contains some utility functions that replace instructions
+// that are not on the DOIT list of guaranteed constant-time instructions.
+
+// ROL is not DOIT, so use shifts.
+inline fn __ROL32(reg u32 x, inline int c) -> reg u32
+{
+  reg u32 xt xb;
+  xt = x;
+  xb = x;
+  xt <<= c;
+  xb >>= 32 - c;
+  xt |= xb;
+  return xt;
+}
+
+// ROR is also not DOIT.
+inline fn __ROR32(reg u32 x, inline int c) -> reg u32
+{
+  x = __ROL32(x, 32 - c);
+  return x;
+}
+
+// ROL is not DOIT, so use shifts.
+inline fn __ROL64(reg u64 x, inline int c) -> reg u64
+{
+  reg u64 xt xb;
+  xt = x;
+  xb = x;
+  xt <<= c;
+  xb >>= 64 - c;
+  xt |= xb;
+  return xt;
+}
+
+// ROR is also not DOIT.
+inline fn __ROR64(reg u64 x, inline int c) -> reg u64
+{
+  x = __ROL64(x, 64 - c);
+  return x;
+}
\ No newline at end of file
diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
index 6003aa4c..cb685a45 100644
--- a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc
@@ -1,5 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600_globals.jinc" // KECCAK1600_RC
 from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600.jinc" // __rhotates_spec
+from Jade require "common/doit.jinc" //__ROL64
 
 // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]
 inline fn __theta_sum_bmi1(reg ptr u64[25] a) -> reg u64[5]
@@ -26,18 +27,13 @@ inline fn __theta_rol_bmi1(reg u64[5] c) -> reg u64[5]
 {
   inline int x;
   reg u64[5] d;
-  reg u64 t;
 
   for x = 0 to 5
   { // D[x] = C[x + 1]
     d[x] = c[(x + 1) % 5];
 
     // D[x] = ROT(D[x], 1)
-    // ROL is not DOIT, so use shifts.
-    t = d[x];
-    ?{}, t = #SHL_64(t, 1);
-    ?{}, d[x] = #SHR_64(d[x], 63);
-    ?{}, d[x] = #OR_64(t, d[x]);
+    d[x] = __ROL64(d[x], 1);
 
     // D[x] ^= C[x-1]
     d[x] ^= c[(x - 1 + 5) % 5];
@@ -60,7 +56,6 @@ inline fn __rol_sum_bmi1(
 {
   inline int r x x_ y_;
   reg u64[5] b;
-  reg u64 t;
 
   for x = 0 to 5
   {
@@ -75,13 +70,9 @@ inline fn __rol_sum_bmi1(
     b[x] ^= d[x_];
 
     // B[x] = ROT( B[x], r[x',y'] );
-    // ROL is not DOIT, so use shifts.
     if(r != 0)
     {
-      t = b[x];
-      ?{}, t = #SHL_64(t, r);
-      ?{}, b[x] = #SHR_64(b[x], 64 - r);
-      ?{}, b[x] = #OR_64(t, b[x]);
+      b[x] = __ROL64(b[x], r);
     }
 
   }
diff --git a/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc
index 9352df5b..7559172c 100644
--- a/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref/keccakf1600.jinc
@@ -1,5 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600_globals.jinc" // KECCAK1600_RC
 from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600.jinc" // __rhotates_spec
+from Jade require "common/doit.jinc" //__ROL64
 
 // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]
 inline fn __theta_sum_ref(stack u64[25] a) -> reg u64[5]
@@ -26,18 +27,13 @@ inline fn __theta_rol_ref(reg u64[5] c) -> reg u64[5]
 {
   inline int x;
   reg u64[5] d;
-  reg u64 t;
 
   for x = 0 to 5
   { // D[x] = C[x + 1]
     d[x] = c[(x + 1) % 5];
 
     // D[x] = ROT(D[x], 1)
-    // ROL is not DOIT, so use shifts.
-    t = d[x];
-    ?{}, t = #SHL_64(t, 1);
-    ?{}, d[x] = #SHR_64(d[x], 63);
-    ?{}, d[x] = #OR_64(t, d[x]);
+    d[x] = __ROL64(d[x], 1);
 
     // D[x] ^= C[x-1]
     d[x] ^= c[(x - 1 + 5) % 5];
@@ -60,7 +56,6 @@ inline fn __rol_sum_ref(
 {
   inline int r x x_ y_;
   reg u64[5] b;
-  reg u64 t;
 
   for x = 0 to 5
   {
@@ -75,13 +70,9 @@ inline fn __rol_sum_ref(
     b[x] ^= d[x_];
 
     // B[x] = ROT( B[x], r[x',y'] );
-    // ROL is not DOIT, so use shifts.
     if(r != 0)
     {
-      t = b[x];
-      ?{}, t = #SHL_64(t, r);
-      ?{}, b[x] = #SHR_64(b[x], 64 - r);
-      ?{}, b[x] = #OR_64(t, b[x]);
+      b[x] = __ROL64(b[x], r);
     }
 
   }
diff --git a/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc b/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc
index d36f8b9b..16302623 100644
--- a/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref/keccakf1600_v0.jinc
@@ -1,3 +1,4 @@
+from Jade require "common/doit.jinc" //__ROL64
 
 u64[24] KECCAK_RC =
 {  0x0000000000000001
@@ -61,22 +62,6 @@ inline fn __rhotates(inline int x y) -> inline int
   return r;
 }
 
-inline fn __ROL64(reg u64 x, inline int c) -> reg u64
-{
-  reg u64 y;
-
-  if (c == 0)
-  { y = x; }
-  else
-  { 
-    // _, _, y = #ROL_64(x, c);
-    ?{}, y = #SHL_64(x, c);
-    ?{}, x = #SHR_64(x, 64 - c);
-    ?{}, y = #OR_64(y, x);
-    }
-  return y;
-}
-
 inline fn __theta_sum_ref(stack u64[25] a) -> reg u64[5]
 {
   inline int i j;
diff --git a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
index 38c98a12..545e7d38 100644
--- a/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/ref1/keccakf1600.jinc
@@ -1,5 +1,6 @@
 from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600_globals.jinc" // KECCAK1600_RC
 from Jade require "common/keccak/keccak1600/amd64/spec/keccakf1600.jinc" // __rhotates_spec
+from Jade require "common/doit.jinc" //__ROL64
 
 // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]
 inline fn __theta_sum_ref1(reg ptr u64[25] a) -> reg u64[5]
@@ -26,18 +27,13 @@ inline fn __theta_rol_ref1(reg u64[5] c) -> reg u64[5]
 {
   inline int x;
   reg u64[5] d;
-  reg u64 t;
 
   for x = 0 to 5
   { // D[x] = C[x + 1]
     d[x] = c[(x + 1) % 5];
 
     // D[x] = ROT(D[x], 1)
-    // ROL is not DOIT, so use shifts.
-    t = d[x];
-    ?{}, t = #SHL_64(t, 1);
-    ?{}, d[x] = #SHR_64(d[x], 63);
-    ?{}, d[x] = #OR_64(t, d[x]);
+    d[x] = __ROL64(d[x], 1);
 
     // D[x] ^= C[x-1]
     d[x] ^= c[(x - 1 + 5) % 5];
@@ -60,7 +56,6 @@ inline fn __rol_sum_ref1(
 {
   inline int r x x_ y_;
   reg u64[5] b;
-  reg u64 t;
 
   for x = 0 to 5
   {
@@ -75,13 +70,9 @@ inline fn __rol_sum_ref1(
     b[x] ^= d[x_];
 
     // B[x] = ROT( B[x], r[x',y'] );
-    // ROL is not DOIT, so use shifts.
     if(r != 0)
     { 
-      t = b[x];
-      ?{}, t = #SHL_64(t, r);
-      ?{}, b[x] = #SHR_64(b[x], 64 - r);
-      ?{}, b[x] = #OR_64(t, b[x]);
+      b[x] = __ROL64(b[x], r);
     }
 
   }
diff --git a/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc
index 0dc60f72..ea4b4bdb 100644
--- a/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc
+++ b/src/common/keccak/keccak1600/amd64/spec/keccakf1600.jinc
@@ -1,5 +1,6 @@
 
 require "keccakf1600_globals.jinc"
+from Jade require "common/doit.jinc" //__ROL64
 
 inline fn __index_spec(inline int x y) -> inline int
 {
@@ -43,7 +44,6 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25]
 {
   inline int x y;
   reg u64[5] c d;
-  reg u64 t;
 
   for x = 0 to 5 {
     c[x] = 0;
@@ -55,11 +55,7 @@ inline fn __theta_spec(stack u64[25] a) -> stack u64[25]
   for x = 0 to 5 {
     d[x] = c[(x + 1) % 5];
     
-    // ROL is not DOIT, so use shifts.
-    t = d[x];
-    ?{}, t = #SHL_64(t, 1);
-    ?{}, d[x] = #SHR_64(d[x], 63);
-    ?{}, d[x] = #OR_64(t, d[x]);
+    d[x] = __ROL64(d[x], 1);
 
     d[x] ^= c[(x + 4) % 5];
   }
@@ -84,11 +80,8 @@ inline fn __rho_spec(stack u64[25] a) -> stack u64[25]
       i = __index_spec(x, y);
       z = __keccak_rho_offsets_spec(i);
 
-      // ROL is not DOIT, so use shifts.
       t = a[i];
-      ?{}, t = #SHL_64(t, z);
-      ?{}, a[i] = #SHR_64(a[i], 64 - z);
-      ?{}, t = #OR_64(t, a[i]);
+      t = __ROL64(t, z);
       a[i] = t;
     }
   }
diff --git a/src/crypto_hash/sha256/amd64/ref/sha256.jinc b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
index 27fbf1e7..b99bf9ed 100644
--- a/src/crypto_hash/sha256/amd64/ref/sha256.jinc
+++ b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
@@ -1,5 +1,6 @@
 
 require "sha256_globals.jinc"
+from Jade require "common/doit.jinc" //__ROR32
 
 inline fn __initH_ref() -> stack u32[8]
 {
@@ -55,9 +56,8 @@ inline fn __store_ref(reg u64 out, stack u32[8] H)
   reg u32 v;
 
   for i=0 to 8
-  {
-    //BSWAP could be used here, but it is not DOIT.
-    (u8)[out + i*4] = H[u8 i*4 + 3];
+  { // BSWAP could be used here, but it is not DOIT.
+    (u8)[out + i*4]     = H[u8 i*4 + 3];
     (u8)[out + i*4 + 1] = H[u8 i*4 + 2];
     (u8)[out + i*4 + 2] = H[u8 i*4 + 1];
     (u8)[out + i*4 + 3] = H[u8 i*4];
@@ -74,14 +74,8 @@ inline fn __SHR_ref(reg u32 x, inline int c) -> reg u32
 
 inline fn __ROTR_ref(reg u32 x, inline int c) -> reg u32
 {
-  // ROR could be used here, but it is not DOIT.
-  reg u32 rt rb;
-  rt = x;
-  rb = x;
-  ?{}, rt = #SHR_32(rt, c);
-  ?{}, rb = #SHL_32(rb, 32 - c);
-  ?{}, rt = #OR_32(rt, rb);
-  return rt;
+  x = __ROR32(x, c);
+  return x;
 }
 
 //(x & y) ^ (!x & z)
@@ -210,7 +204,7 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64
   {
     for t=0 to 16
     { //BSWAP could be used here, but it is not DOIT.
-      W[u8 t*4] = (u8)[in + t*4 + 3];
+      W[u8 t*4]     = (u8)[in + t*4 + 3];
       W[u8 t*4 + 1] = (u8)[in + t*4 + 2];
       W[u8 t*4 + 2] = (u8)[in + t*4 + 1];
       W[u8 t*4 + 3] = (u8)[in + t*4];
@@ -300,10 +294,10 @@ fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) ->
     oblocks = i << 4;
     for t=0 to 16
     { //BSWAP could be used here, but it is not DOIT.
-      W[u8 t*4] = sblocks[u8 (int)oblocks*4 + t*4 + 3];
-      W[u8 t*4 + 1] = sblocks[u8 (int)oblocks*4 + t*4 + 2];
-      W[u8 t*4 + 2] = sblocks[u8 (int)oblocks*4 + t*4 + 1];
-      W[u8 t*4 + 3] = sblocks[u8 (int)oblocks*4 + t*4];
+      W[u8 t*4]     = sblocks[u8 oblocks*4 + t*4 + 3];
+      W[u8 t*4 + 1] = sblocks[u8 oblocks*4 + t*4 + 2];
+      W[u8 t*4 + 2] = sblocks[u8 oblocks*4 + t*4 + 1];
+      W[u8 t*4 + 3] = sblocks[u8 oblocks*4 + t*4];
     }
     s_sblocks = sblocks;
 
diff --git a/src/crypto_hash/sha512/amd64/ref/sha512.jinc b/src/crypto_hash/sha512/amd64/ref/sha512.jinc
index 76e5c4ca..76426f24 100644
--- a/src/crypto_hash/sha512/amd64/ref/sha512.jinc
+++ b/src/crypto_hash/sha512/amd64/ref/sha512.jinc
@@ -1,5 +1,6 @@
 
 require "sha512_globals.jinc"
+from Jade require "common/doit.jinc" //__ROR64
 
 inline fn __initH_ref() -> stack u64[8]
 {
@@ -51,19 +52,14 @@ inline fn __store_H_ref(reg ptr u64[8] H, reg u64 a b c d e f g h) -> reg ptr u6
 
 inline fn __store_ref(reg u64 out, stack u64[8] H)
 {
-  inline int i;
-  reg u64 v;
+  inline int i j;
 
   for i=0 to 8
-  { //BSWAP could be used here, but it is not DOIT.
-    (u8)[out + i*8] = H[u8 i*8 + 7];
-    (u8)[out + i*8 + 1] = H[u8 i*8 + 6];
-    (u8)[out + i*8 + 2] = H[u8 i*8 + 5];
-    (u8)[out + i*8 + 3] = H[u8 i*8 + 4];
-    (u8)[out + i*8 + 4] = H[u8 i*8 + 3];
-    (u8)[out + i*8 + 5] = H[u8 i*8 + 2];
-    (u8)[out + i*8 + 6] = H[u8 i*8 + 1];
-    (u8)[out + i*8 + 7] = H[u8 i*8];
+  { // BSWAP could be used here, but it is not DOIT.
+    for j=0 to 8
+    {
+      (u8)[out + i*8 + j] = H[u8 i*8 + (7 - j)];
+    }
   }
 }
 
@@ -77,14 +73,8 @@ inline fn __SHR_ref(reg u64 x, inline int c) -> reg u64
 
 inline fn __ROTR_ref(reg u64 x, inline int c) -> reg u64
 {
-  reg u64 rt rb;
-  //ROR could be used here, but it is not DOIT.
-  rt = x;
-  rb = x;
-  ?{}, rt = #SHR_64(rt, c);
-  ?{}, rb = #SHL_64(rb, 64 - c);
-  ?{}, rt = #OR_64(rt, rb);
-  return rt;
+  x = __ROR64(x, c);
+  return x;
 }
 
 //(x & y) ^ (!x & z)
@@ -195,8 +185,8 @@ inline fn __Wt_ref(stack u64[80] W, inline int t) -> stack u64[80]
 
 fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64, reg u64
 {
-  inline int t;
-  reg u64 T1 T2 a b c d e f g h r v;
+  inline int t u;
+  reg u64 T1 T2 a b c d e f g h r;
   stack u64[80] W;
   reg ptr u64[80] Kp;
   stack ptr u64[8] Hp;
@@ -212,15 +202,11 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64
   while(inlen >= 128)
   {
     for t=0 to 16
-    { //BSWAP could be used here, but it is not DOIT.
-      W[u8 t*8] = (u8)[in + t*8 + 7];
-      W[u8 t*8 + 1] = (u8)[in + t*8 + 6];
-      W[u8 t*8 + 2] = (u8)[in + t*8 + 5];
-      W[u8 t*8 + 3] = (u8)[in + t*8 + 4];
-      W[u8 t*8 + 4] = (u8)[in + t*8 + 3];
-      W[u8 t*8 + 5] = (u8)[in + t*8 + 2];
-      W[u8 t*8 + 6] = (u8)[in + t*8 + 1];
-      W[u8 t*8 + 7] = (u8)[in + t*8];
+    { // BSWAP could be used here, but it is not DOIT.
+      for u=0 to 8
+      {
+        W[u8 t*8 + u] = (u8)[in + t*8 + (7 - u)];
+      }
     }
     in_s = in;
 
@@ -285,8 +271,8 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64
 
 fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) -> reg ptr u64[8], reg ptr u64[32]
 {
-  inline int t;
-  reg u64 T1 T2 a b c d e f g h r v;
+  inline int t u;
+  reg u64 T1 T2 a b c d e f g h r;
   stack u64[80] W;
   reg ptr u64[80] Kp;
   stack ptr u64[8] Hp;
@@ -306,16 +292,11 @@ fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) ->
     s_i = i;
     oblocks = i << 4;
     for t=0 to 16
-    { 
-      //BSWAP could be used here, but it is not DOIT.
-      W[u8 t*8] = sblocks[u8 (int)oblocks*8 + t*8 + 7];
-      W[u8 t*8 + 1] = sblocks[u8 (int)oblocks*8 + t*8 + 6];
-      W[u8 t*8 + 2] = sblocks[u8 (int)oblocks*8 + t*8 + 5];
-      W[u8 t*8 + 3] = sblocks[u8 (int)oblocks*8 + t*8 + 4];
-      W[u8 t*8 + 4] = sblocks[u8 (int)oblocks*8 + t*8 + 3];
-      W[u8 t*8 + 5] = sblocks[u8 (int)oblocks*8 + t*8 + 2];
-      W[u8 t*8 + 6] = sblocks[u8 (int)oblocks*8 + t*8 + 1];
-      W[u8 t*8 + 7] = sblocks[u8 (int)oblocks*8 + t*8];
+    { // BSWAP could be used here, but it is not DOIT.
+      for u=0 to 8
+      {
+        W[u8 t*8 + u] = sblocks[u8 oblocks*8 + t*8 + (7 - u)];
+      }
     }
     s_sblocks = sblocks;
 

From 5a8da25c1d0df3281106a463a18500a8da8196b9 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Tue, 27 Feb 2024 17:09:22 +0100
Subject: [PATCH 04/16] Remove unused variables from sha256.

---
 src/crypto_hash/sha256/amd64/ref/sha256.jinc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/crypto_hash/sha256/amd64/ref/sha256.jinc b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
index b99bf9ed..4cab1790 100644
--- a/src/crypto_hash/sha256/amd64/ref/sha256.jinc
+++ b/src/crypto_hash/sha256/amd64/ref/sha256.jinc
@@ -53,7 +53,6 @@ inline fn __store_H_ref(reg ptr u32[8] H, reg u32 a b c d e f g h) -> reg ptr u3
 inline fn __store_ref(reg u64 out, stack u32[8] H)
 {
   inline int i;
-  reg u32 v;
 
   for i=0 to 8
   { // BSWAP could be used here, but it is not DOIT.
@@ -187,7 +186,7 @@ inline fn __Wt_ref(stack u32[64] W, inline int t) -> stack u32[64]
 fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64, reg u64
 {
   inline int t;
-  reg u32 T1 T2 a b c d e f g h r v;
+  reg u32 T1 T2 a b c d e f g h r;
   stack u32[64] W;
   reg ptr u32[64] Kp;
   stack ptr u32[8] Hp;

From 92557ae2def07760a22fdbb85362dca4058e7f10 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Thu, 29 Feb 2024 12:34:37 +0100
Subject: [PATCH 05/16] Fix chacha to use DOIT.

This spills some more registers to the stack in the reference chacha core. This comes at a significant performance cost. Further experimentation should be done with register packing instead.
---
 src/common/doit.jinc                          | 36 +++++++++++
 .../chacha/common/amd64/ref/chacha_core.jinc  | 60 ++++++++++++++-----
 2 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/src/common/doit.jinc b/src/common/doit.jinc
index 2039c1f6..797e2333 100644
--- a/src/common/doit.jinc
+++ b/src/common/doit.jinc
@@ -13,6 +13,42 @@ inline fn __ROL32(reg u32 x, inline int c) -> reg u32
   return xt;
 }
 
+inline fn __ROL32x(reg u32 x, inline int c) -> reg u32
+{
+  reg u32 y;
+  y = x;
+  x <<= c;
+  y >>= 32 - c;
+  x |= y;
+  return x;
+}
+
+inline fn __ROL32y(reg u32 x, inline int c) -> reg u32
+{
+  reg u32 y;
+  y = x;
+  x <<= c;
+  y >>= 32 - c;
+  y |= x;
+  return y;
+}
+
+inline fn __ROL32s(reg u32 x, inline int c) -> reg u32
+{
+  stack u32 y;
+  y = x;
+  x <<= c;
+  y >>= 32 - c;
+  x |= y;
+  return x;
+}
+
+inline fn __ROL32i(reg u32 x, inline int c) -> reg u32
+{
+  ?{}, x = #ROL_32(x, c);
+  return x;
+}
+
 // ROR is also not DOIT.
 inline fn __ROR32(reg u32 x, inline int c) -> reg u32
 {
diff --git a/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc b/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc
index d0f238d6..b870bc42 100644
--- a/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc
+++ b/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc
@@ -1,3 +1,4 @@
+from Jade require "common/doit.jinc" //__ROL32
 
 // the following implementation requires:
 // - (even) param int CHACHA_ROUNDS;
@@ -22,7 +23,7 @@ inline fn __copy_state_ref(stack u32[16] st) -> reg u32[16], stack u32
 }
 
 
-///////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////// 
 
 
 // not exported; may be useful as spec;
@@ -30,7 +31,7 @@ inline fn __line_ref(reg u32[16] k, inline int a b c r) -> reg u32[16]
 {
   k[a] += k[b];
   k[c] ^= k[a];
-  _, _, k[c] = #ROL_32(k[c], r);
+  k[c] = __ROL32x(k[c], r);
   return k;
 }
 
@@ -111,9 +112,9 @@ inline fn __half_round_inline_ref(
 
   k[d0] ^= k[a0];
   k[d1] ^= k[a1];
-
-  _, _, k[d0] = #ROL_32(k[d0], 16);
-  _, _, k[d1] = #ROL_32(k[d1], 16);
+  
+  k[d0] = __ROL32x(k[d0], 16);
+  k[d1] = __ROL32x(k[d1], 16);
 
   //k = line(k, c, d, b, 12);
   k[c0] += k[d0];
@@ -122,9 +123,9 @@ inline fn __half_round_inline_ref(
   k[b0] ^= k[c0];
   k[b1] ^= k[c1];
 
-  _, _, k[b0] = #ROL_32(k[b0], 12);
-  _, _, k[b1] = #ROL_32(k[b1], 12);
-
+  k[b0] = __ROL32x(k[b0], 12);
+  k[b1] = __ROL32x(k[b1], 12);
+  
   //k = line(k, a, b, d, 8);
   k[a0] += k[b0];
   k[a1] += k[b1];
@@ -132,8 +133,8 @@ inline fn __half_round_inline_ref(
   k[d0] ^= k[a0];
   k[d1] ^= k[a1];
 
-  _, _, k[d0] = #ROL_32(k[d0], 8);
-  _, _, k[d1] = #ROL_32(k[d1], 8);
+  k[d0] = __ROL32x(k[d0], 8);
+  k[d1] = __ROL32x(k[d1], 8);
 
   //k = line(k, c, d, b, 7);
   k[c0] += k[d0];
@@ -142,22 +143,42 @@ inline fn __half_round_inline_ref(
   k[b0] ^= k[c0];
   k[b1] ^= k[c1];
 
-  _, _, k[b0] = #ROL_32(k[b0], 7);
-  _, _, k[b1] = #ROL_32(k[b1], 7);
+  k[b0] = __ROL32x(k[b0], 7);
+  k[b1] = __ROL32x(k[b1], 7);
 
   return k;
 }
 
 
 // used;
-inline fn __double_round_inline_ref(reg u32[16] k, stack u32 k14 k15) -> reg u32[16], stack u32, stack u32
+//
+//                                                                             
+// The function below requires the spillage of some state on the stack
+// this is due to the 
+//         ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐                   
+//         │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│  + - Value used   
+// ┌───────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                   
+// │       │ +│  │ +│  │ +│ S│ +│  │ +│  │ +│  │ +│  │ +│ S│  S - Stack spills 
+// │ Round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                   
+// │       │  │ +│  │ +│ S│ +│  │ +│  │ +│  │ +│  │ +│ S│ +│                   
+// ├───────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                   
+// │       │ +│ +│  │  │ S│ +│ +│  │  │  │ +│ +│ +│  │ S│ +│                   
+// │ Round ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                   
+// │       │  │  │ +│ +│ +│ S│  │ +│ +│ +│  │  │  │ +│ +│ S│                   
+// └───────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘                   
+//
+inline fn __double_round_inline_ref(reg u32[16] k, stack u32 k4 k5 k14 k15) -> reg u32[16], stack u32, stack u32, stack u32, stack u32
 {
   k[14] = k14;
+  k[4] = k4;
 
   k = __half_round_inline_ref(k, 0, 4, 8, 12,
                                  2, 6, 10, 14);
   k14 = k[14];
+  k4 = k[4];
   k[15] = k15;
+  k[5] = k5;
+  
 
   k = __half_round_inline_ref(k, 1, 5, 9, 13,
                                  3, 7, 11, 15);
@@ -166,35 +187,42 @@ inline fn __double_round_inline_ref(reg u32[16] k, stack u32 k14 k15) -> reg u32
                                  0, 5, 10, 15);
 
   k15 = k[15];
+  k5 = k[5];
   k[14] = k14;
+  k[4] = k4;
 
   k = __half_round_inline_ref(k, 2, 7, 8, 13,
                                  3, 4, 9, 14);
 
   k14 = k[14];
+  k4 = k[4];
 
-  return k, k14, k15;
+  return k, k4, k5, k14, k15;
 }
 
 
 // used;
 inline fn __rounds_inline_ref(reg u32[16] k, stack u32 k15) -> reg u32[16], stack u32
 {
-  stack u32 s_c k14;
+  stack u32 s_c k4 k5 k14;
   reg u32 c;
 
   k14 = k[14];
+  k4 = k[4];
+  k5 = k[5];
   c = (CHACHA_ROUNDS/2);
 
   while
   { s_c = c;
     
-    k, k14, k15 = __double_round_inline_ref(k, k14, k15);
+    k, k4, k5, k14, k15 = __double_round_inline_ref(k, k4, k5, k14, k15);
 
     c = s_c;
     (_,_,_,_,c) = #DEC_32(c);
   } (c > 0)
 
+  k[4] = k4;
+  k[5] = k5;
   k[14] = k14;
   return k, k15;
 }

From dd906fbf1483370a0aef3d77084abe53e3cf7537 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Thu, 29 Feb 2024 16:19:36 +0100
Subject: [PATCH 06/16] Make crypto_stream DOIT.

But at what cost?!?
---
 .../chacha/common/amd64/ref/chacha_core.jinc  |  3 +-
 .../common/amd64/ref/salsa20_core.jinc        | 66 +++++++++++++------
 src/crypto_stream/xsalsa20/amd64/avx/Makefile |  1 +
 .../xsalsa20/amd64/avx2/Makefile              |  1 +
 4 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc b/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc
index b870bc42..aeecff10 100644
--- a/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc
+++ b/src/crypto_stream/chacha/common/amd64/ref/chacha_core.jinc
@@ -154,7 +154,8 @@ inline fn __half_round_inline_ref(
 //
 //                                                                             
 // The function below requires the spillage of some state on the stack
-// this is due to the 
+// this is due to the use of an auxiliary register in the implementation
+// of __ROL32x.
 //         ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐                   
 //         │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│  + - Value used   
 // ┌───────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                   
diff --git a/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc b/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc
index 91103b72..ed884d50 100644
--- a/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc
+++ b/src/crypto_stream/salsa20/common/amd64/ref/salsa20_core.jinc
@@ -1,3 +1,4 @@
+from Jade require "common/doit.jinc" //__ROL32
 
 // the following implementation requires:
 // - (even) param int SALSA20_ROUNDS;
@@ -33,7 +34,7 @@ inline fn __line_ref(reg u32[16] k, inline int a b c r) -> reg u32[16]
   reg u32 t;
   t  = k[b];
   t += k[c];
-  _, _, t = #ROL_32(t, r);
+  t = __ROL32x(t, r);
   k[a] ^= t;
   return k;
 }
@@ -49,58 +50,81 @@ inline fn __quarter_round_ref(reg u32[16] k, inline int a b c d) -> reg u32[16]
 }
 
 
-inline fn __column_round_ref(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+inline fn __column_round_ref(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13;
 
-  k = __quarter_round_ref(k,  0,  4,  8, 12); k12 = k[12]; k[2] = k2;
-  k = __quarter_round_ref(k,  5,  9, 13,  1); k13 = k[13]; k[3] = k3;
-  k = __quarter_round_ref(k, 10, 14,  2,  6);
+  k = __quarter_round_ref(k,  0,  4,  8, 12); k12 = k[12];
+  k = __quarter_round_ref(k,  5,  9, 13,  1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] = k6;
+  k = __quarter_round_ref(k, 10, 14,  2,  6); k[3] = k3;
   k = __quarter_round_ref(k, 15,  3,  7, 11);
 
-  return k, k12, k13;
+  return k, k9, k12, k13;
 }
 
 
-inline fn __line_round_ref(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32
+inline fn __line_round_ref(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k2 k3;
+  stack u32 k2 k3 k6;
 
-  k = __quarter_round_ref(k,  0,  1,  2,  3); k2 = k[2]; k[12] = k12;
-  k = __quarter_round_ref(k,  5,  6,  7,  4); k3 = k[3]; k[13] = k13;
-  k = __quarter_round_ref(k, 10, 11,  8,  9);
+  k = __quarter_round_ref(k,  0,  1,  2,  3); k2 = k[2]; k3 = k[3];
+  k = __quarter_round_ref(k,  5,  6,  7,  4); k6 = k[6]; k[9] = k9;
+  k = __quarter_round_ref(k, 10, 11,  8,  9); k[12] = k12; k[13] = k13;
   k = __quarter_round_ref(k, 15, 12, 13, 14);
 
-  return k, k2, k3;
+  return k, k2, k3, k6;
 }
 
 
-inline fn __double_round_ref(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+// The function below requires the spillage of some state on the stack.
+//          ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐                  
+//          │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│  Spilled values │                  
+// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤  + - Value used  
+// │ Column │ +│  │ S│ S│ +│  │ S│  │ +│  │  │  │ +│  │  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │  S - Stack spills
+// │        │  │ +│ S│ S│  │ +│ S│  │  │ +│  │  │ S│ +│  │  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ +│ S│  │  │ +│  │  │ S│ +│  │ S│ S│ +│  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │  │ +│  │  │  │ +│  │ S│  │ +│ S│ S│  │ +│  3              │                  
+// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │ Line   │ +│ +│ +│ +│  │  │  │  │  │ S│  │  │ S│ S│  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│ +│ +│ +│ +│  │ S│  │  │ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │ +│ +│ +│ +│ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │  │  │  │  │ +│ +│ +│ +│  3              │                  
+// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘                  
+//
+inline fn __double_round_ref(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13;
 
-  k, k12, k13 = __column_round_ref(k, k2, k3);
-  k, k2,  k3  = __line_round_ref(k, k12, k13);
-  return k, k2, k3;
+  k, k9, k12, k13 = __column_round_ref(k, k2, k3, k6);
+  k, k2,  k3,  k6 = __line_round_ref(k, k9, k12, k13);
+  return k, k2, k3, k6;
 }
 
 
 inline fn __rounds_ref(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32
 {
-  stack u32 s_c k15;
+  stack u32 s_c k15 k6;
   reg u32 c;
 
+  k6 = k[6];
+
   c = (SALSA20_ROUNDS/2);
   while
   { s_c = c;
 
-    k, k2, k3 = __double_round_ref(k, k2, k3);
+    k, k2, k3, k6 = __double_round_ref(k, k2, k3, k6);
 
     c = s_c;
     (_,_,_,_,c) = #DEC_32(c);
   } (c > 0)
 
-  k15 = k[15]; k[2] = k2; k[3] = k3;
+  k15 = k[15]; k[2] = k2; k[3] = k3; k[6] = k6;
   return k, k15;
 }
 
diff --git a/src/crypto_stream/xsalsa20/amd64/avx/Makefile b/src/crypto_stream/xsalsa20/amd64/avx/Makefile
index a5c992e6..60659907 100644
--- a/src/crypto_stream/xsalsa20/amd64/avx/Makefile
+++ b/src/crypto_stream/xsalsa20/amd64/avx/Makefile
@@ -1,2 +1,3 @@
+override JFLAGS += -lazy-regalloc
 SRCS := stream.jazz
 include ../../../../Makefile.common
diff --git a/src/crypto_stream/xsalsa20/amd64/avx2/Makefile b/src/crypto_stream/xsalsa20/amd64/avx2/Makefile
index a5c992e6..60659907 100644
--- a/src/crypto_stream/xsalsa20/amd64/avx2/Makefile
+++ b/src/crypto_stream/xsalsa20/amd64/avx2/Makefile
@@ -1,2 +1,3 @@
+override JFLAGS += -lazy-regalloc
 SRCS := stream.jazz
 include ../../../../Makefile.common

From 8fbd90da67ebbd9714f85e17d3b13948aa790c2e Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Thu, 29 Feb 2024 16:59:56 +0100
Subject: [PATCH 07/16] Protect utility rotate functions with checks on the
 rotate amount.

The rotate instruction just ignores these, but the ">>=" operators complain.
---
 src/common/doit.jinc | 53 ++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/common/doit.jinc b/src/common/doit.jinc
index 797e2333..ebe1bcd7 100644
--- a/src/common/doit.jinc
+++ b/src/common/doit.jinc
@@ -6,20 +6,26 @@ inline fn __ROL32(reg u32 x, inline int c) -> reg u32
 {
   reg u32 xt xb;
   xt = x;
-  xb = x;
-  xt <<= c;
-  xb >>= 32 - c;
-  xt |= xb;
+  if (c != 0 && c != 32)
+  {
+    xb = x;
+    xt <<= c;
+    xb >>= 32 - c;
+    xt |= xb;
+  }
   return xt;
 }
 
 inline fn __ROL32x(reg u32 x, inline int c) -> reg u32
 {
   reg u32 y;
-  y = x;
-  x <<= c;
-  y >>= 32 - c;
-  x |= y;
+  if (c != 0 && c != 32)
+  {
+    y = x;
+    x <<= c;
+    y >>= 32 - c;
+    x |= y;
+  }
   return x;
 }
 
@@ -27,19 +33,25 @@ inline fn __ROL32y(reg u32 x, inline int c) -> reg u32
 {
   reg u32 y;
   y = x;
-  x <<= c;
-  y >>= 32 - c;
-  y |= x;
+  if (c != 0 && c != 32)
+  {
+    x <<= c;
+    y >>= 32 - c;
+    y |= x;
+  }
   return y;
 }
 
 inline fn __ROL32s(reg u32 x, inline int c) -> reg u32
 {
   stack u32 y;
-  y = x;
-  x <<= c;
-  y >>= 32 - c;
-  x |= y;
+  if (c != 0 && c != 32)
+  {
+    y = x;
+    x <<= c;
+    y >>= 32 - c;
+    x |= y;
+  }
   return x;
 }
 
@@ -61,10 +73,13 @@ inline fn __ROL64(reg u64 x, inline int c) -> reg u64
 {
   reg u64 xt xb;
   xt = x;
-  xb = x;
-  xt <<= c;
-  xb >>= 64 - c;
-  xt |= xb;
+  if (c != 0 && c != 64)
+  {
+    xb = x;
+    xt <<= c;
+    xb >>= 64 - c;
+    xt |= xb;
+  }
   return xt;
 }
 

From c3f5fb54e0ed558b73801672bbd3071edc02a7cb Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Thu, 29 Feb 2024 17:01:42 +0100
Subject: [PATCH 08/16] Remove non-DOIT VMOVLPD and VMOVHPD.

---
 src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc | 11 +++++++----
 src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc | 13 ++++++++-----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc b/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc
index d372b648..975484bd 100644
--- a/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc
+++ b/src/crypto_xof/shake128/amd64/avx2/shake128_4x.jinc
@@ -14,6 +14,7 @@ inline fn __shake128_squeezeblock4x(
 {
   reg u256 t256;
   reg u128 t128;
+  reg u64 t64;
   inline int i;
 
   state = _keccakf1600_4x_avx2(state);
@@ -21,11 +22,13 @@ inline fn __shake128_squeezeblock4x(
 	for i = 0 to (SHAKE128_RATE / 8) {
     t256 = state[i];
     t128 = (128u)t256;
-		h0[u64 i] = #VMOVLPD(t128);
-		h1[u64 i] = #VMOVHPD(t128);
+		h0[u64 i] = (64u)t128;
+    t128 = #VPSRLDQ(t128, 8);
+		h1[u64 i] = (64u)t128;
     t128 = #VEXTRACTI128(t256, 1);
-		h2[u64 i] = #VMOVLPD(t128);
-		h3[u64 i] = #VMOVHPD(t128);
+		h2[u64 i] = (64u)t128;
+    t128 = #VPSRLDQ(t128, 8);
+		h3[u64 i] = (64u)t128;
 	}
 
   return state, h0, h1, h2, h3;
diff --git a/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc b/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc
index c4d1db6b..0688b7de 100644
--- a/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc
+++ b/src/crypto_xof/shake256/amd64/avx2/shake256_4x.jinc
@@ -1,5 +1,5 @@
 
-from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_4x.jinc"
+from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_4x.jinc" // _keccakf1600_4x_avx2
 from Jade require "common/keccak/common/fips202_params.jinc" // SHAKE256_RATE
 
 inline fn __shake256_squeezeblock4x(
@@ -14,6 +14,7 @@ inline fn __shake256_squeezeblock4x(
 {
   reg u256 t256;
   reg u128 t128;
+  reg u64 t64;
   inline int i;
 
   state = _keccakf1600_4x_avx2(state);
@@ -21,11 +22,13 @@ inline fn __shake256_squeezeblock4x(
 	for i = 0 to (SHAKE256_RATE / 8) {
     t256 = state[i];
     t128 = (128u)t256;
-		h0[u64 i] = #VMOVLPD(t128);
-		h1[u64 i] = #VMOVHPD(t128);
+		h0[u64 i] = (64u)t128;
+    t128 = #VPSRLDQ(t128, 8);
+		h1[u64 i] = (64u)t128;
     t128 = #VEXTRACTI128(t256, 1);
-		h2[u64 i] = #VMOVLPD(t128);
-		h3[u64 i] = #VMOVHPD(t128);
+		h2[u64 i] = (64u)t128;
+    t128 = #VPSRLDQ(t128, 8);
+		h3[u64 i] = (64u)t128;
 	}
 
   return state, h0, h1, h2, h3;

From be794f30f75ff04e905c2f1fd01f76ca8b827df6 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Fri, 1 Mar 2024 13:19:05 +0100
Subject: [PATCH 09/16] Add DOIT POPCNT.

---
 src/common/doit.jinc | 70 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 13 deletions(-)

diff --git a/src/common/doit.jinc b/src/common/doit.jinc
index ebe1bcd7..81db77cb 100644
--- a/src/common/doit.jinc
+++ b/src/common/doit.jinc
@@ -29,19 +29,6 @@ inline fn __ROL32x(reg u32 x, inline int c) -> reg u32
   return x;
 }
 
-inline fn __ROL32y(reg u32 x, inline int c) -> reg u32
-{
-  reg u32 y;
-  y = x;
-  if (c != 0 && c != 32)
-  {
-    x <<= c;
-    y >>= 32 - c;
-    y |= x;
-  }
-  return y;
-}
-
 inline fn __ROL32s(reg u32 x, inline int c) -> reg u32
 {
   stack u32 y;
@@ -87,5 +74,62 @@ inline fn __ROL64(reg u64 x, inline int c) -> reg u64
 inline fn __ROR64(reg u64 x, inline int c) -> reg u64
 {
   x = __ROL64(x, 64 - c);
+  return x;
+}
+
+// POPCNT is not DOIT.
+inline fn __POPCNT32(reg u32 i) -> reg u32
+{
+  reg u32 x y;
+  
+  // i = i - ((i >> 1) & 0x55555555);        // add pairs of bits
+  x = i >> 1;
+  x &= 0x55555555;
+  i -= x;
+  
+  // i = (i & 0x33333333) + ((i >> 2) & 0x33333333);  // quads
+  x = i & 0x33333333;
+  y = i >> 2;
+  y &= 0x33333333;
+  i = x + y;
+
+  // i = (i + (i >> 4)) & 0x0f0f0f0f;        // groups of 8
+  x = i >> 4;
+  x += i;
+  i = x & 0x0f0f0f0f;
+
+  // i *= 0x01010101;                        // horizontal sum of bytes
+  i *= 0x01010101;
+  // i >> 24;                                // return just that top byte
+  i >>= 24;
+  return i;
+}
+
+inline fn __POPCNT64(reg u64 i) -> reg u64
+{
+  reg u64 x y;
+  // i -= (i >> 1) & 0x5555555555555555;             //put count of each 2 bits into those 2 bits
+  x = i;
+  x >>= 1;
+  x &= 0x5555555555555555;
+  i -= x;
+
+  // i = (i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333); //put count of each 4 bits into those 4 bits 
+  x = i;
+  x &= 0x3333333333333333;
+  y = i >> 2;
+  y &= 0x3333333333333333;
+  i = x + y;
+
+  // i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0f;        //put count of each 8 bits into those 8 bits 
+  x = i;
+  x >>= 4;
+  x += i;
+  x &= 0x0f0f0f0f0f0f0f0f;
+
+  // (i * 0x0101010101010101) >> 56;  //returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ... 
+  x *= 0x0101010101010101;
+  x >>= 56;
+
   return x;
 }
\ No newline at end of file

From 4315ad540286092dd08ad69ecb9dcd7f67dbe7f6 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Fri, 1 Mar 2024 17:35:40 +0100
Subject: [PATCH 10/16] Declassify publicseed in Kyber.

---
 src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc | 9 ++++++---
 src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc  | 9 ++++++---
 src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc | 9 ++++++---
 src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc  | 9 ++++++---
 4 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc b/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc
index 1f9e08d6..602bd6a5 100644
--- a/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber512/amd64/avx2/indcpa.jinc
@@ -23,7 +23,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES]
   for i=0 to KYBER_SYMBYTES/8
   {
     t64 = buf[u64 i];
-    publicseed[u64 i] = t64;
+    // We declassify here because we are reading the public part of the seed.
+    #declassify publicseed[u64 i] = t64;
     t64 = buf[u64 i + KYBER_SYMBYTES/8];
     noiseseed[u64 i] = t64;
   }
@@ -82,7 +83,8 @@ fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u6
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed[u64 (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed[u64 (int)i] = t64;
     pkp += 8;
     i += 1;
   }
@@ -148,7 +150,8 @@ fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_BYTES] ctp, reg ptr u8[KYBER_INDCPA_MS
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed[u64 (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed[u64 (int)i] = t64;
     pkp += 8;
     i += 1;
   }
diff --git a/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc
index c1bb634b..040c59c0 100644
--- a/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber512/amd64/ref/indcpa.jinc
@@ -23,7 +23,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES]
   for i=0 to KYBER_SYMBYTES/8
   {
     t64 = buf[u64 i];
-    publicseed[u64 i] = t64;
+    // We declassify here because we are reading the public part of the seed.
+    #declassify publicseed[u64 i] = t64;
     t64 = buf[u64 i + KYBER_SYMBYTES/8];
     noiseseed[u64 i] = t64;
   }
@@ -88,7 +89,8 @@ fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u6
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed[u64 (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed[u64 (int)i] = t64;
     pkp += 8;
     i += 1;
   }
@@ -163,7 +165,8 @@ fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_BYTES] ctp, reg ptr u8[KYBER_INDCPA_MS
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed[u64 (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed[u64 (int)i] = t64;
     pkp += 8;
     i += 1;
   }
diff --git a/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc b/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc
index 9852b48f..89207574 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/avx2/indcpa.jinc
@@ -23,7 +23,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES]
   for i=0 to KYBER_SYMBYTES/8
   {
     t64 = buf[u64 i];
-    publicseed[u64 i] = t64;
+    // We declassify here because we are reading the public part of the seed.
+    #declassify publicseed[u64 i] = t64;
     t64 = buf[u64 i + KYBER_SYMBYTES/8];
     noiseseed[u64 i] = t64;
   }
@@ -85,7 +86,8 @@ fn __indcpa_enc_0(stack u64 sctp, reg ptr u8[KYBER_INDCPA_MSGBYTES] msgp, reg u6
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed[u64 (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed[u64 (int)i] = t64;
     pkp += 8;
     i += 1;
   }
@@ -150,7 +152,8 @@ fn __indcpa_enc_1(reg ptr u8[KYBER_INDCPA_BYTES] ctp, reg ptr u8[KYBER_INDCPA_MS
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed[u64 (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed[u64 (int)i] = t64;
     pkp += 8;
     i += 1;
   }
diff --git a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
index 34c8982f..c2254b90 100644
--- a/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
+++ b/src/crypto_kem/kyber/kyber768/amd64/ref/indcpa.jinc
@@ -37,7 +37,8 @@ fn __indcpa_keypair_derand(reg u64 pkp, reg u64 skp, reg ptr u8[KYBER_SYMBYTES]
   for i=0 to KYBER_SYMBYTES/8
   {
     t64 = buf[u64 i];
-    publicseed[u64 i] = t64;
+    // We declassify here because we are reading the public part of the seed.
+    #declassify publicseed[u64 i] = t64;
     t64 = buf[u64 i + KYBER_SYMBYTES/8];
     noiseseed[u64 i] = t64;
   }
@@ -110,7 +111,8 @@ fn __indcpa_enc(stack u64 sctp, reg ptr u8[32] msgp, reg u64 pkp, reg ptr u8[KYB
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed.[u64 8 * (int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed.[u64 8 * (int)i] = t64;
     pkp += 8;
     i += 1;
   }
@@ -195,7 +197,8 @@ fn __iindcpa_enc(reg ptr u8[KYBER_CT_LEN] ctp, reg ptr u8[32] msgp, reg u64 pkp,
   while (i < KYBER_SYMBYTES/8)
   {
     t64 = (u64)[pkp];
-    publicseed.[u64 8*(int)i] = t64;
+    // We declassify here because we are reading the public part of the seed from the public key.
+    #declassify publicseed.[u64 8*(int)i] = t64;
     pkp += 8;
     i += 1;
   }

From f5c1a4c23d42d54201466432a3c448992d3c7c9c Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Wed, 6 Mar 2024 16:46:18 +0100
Subject: [PATCH 11/16] Remove SHRD use in Poly1305 impls.

---
 .../poly1305/amd64/avx/poly1305.jinc                 | 12 +++++++++---
 .../poly1305/amd64/avx2/poly1305.jinc                | 12 +++++++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc b/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc
index d3e9f7cc..89769a37 100644
--- a/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc
+++ b/src/crypto_onetimeauth/poly1305/amd64/avx/poly1305.jinc
@@ -29,8 +29,10 @@ inline fn __unpack_avx(
   r12[u64 o + 2] = l;
 
   l = rt[0];
-  ?{},l = #SHRD(l, rt[1], 52);
-  h = l;
+  l >>= 52;
+  h = rt[1];
+  h <<= 12;
+  l |= h;
   l &= mask26;
   r12[u64 o + 4] = l;
 
@@ -38,8 +40,12 @@ inline fn __unpack_avx(
   l >>= 26;
   l &= mask26;
   r12[u64 o + 6] = l;
+  
   l = rt[1];
-  ?{}, l = #SHRD(l, rt[2], 40);
+  l >>= 40;
+  h = rt[2];
+  h <<= 24;
+  l |= h;
   r12[u64 o + 8] = l;
 
   return r12;
diff --git a/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc b/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc
index f641f9dd..933b001d 100644
--- a/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc
+++ b/src/crypto_onetimeauth/poly1305/amd64/avx2/poly1305.jinc
@@ -29,8 +29,10 @@ inline fn __unpack_avx2(
   r1234[u64 o + 4] = l;
 
   l = rt[0];
-  ?{}, l = #SHRD(l, rt[1], 52);
-  h = l;
+  l >>= 52;
+  h = rt[1];
+  h <<= 12;
+  l |= h;
   l &= mask26;
   r1234[u64 o + 8] = l;
 
@@ -38,8 +40,12 @@ inline fn __unpack_avx2(
   l >>= 26;
   l &= mask26;
   r1234[u64 o + 12] = l;
+
   l = rt[1];
-  ?{}, l = #SHRD(l, rt[2], 40);
+  l >>= 40;
+  h = rt[2];
+  h <<= 24;
+  l |= h;
   r1234[u64 o + 16] = l;
 
   return r1234;

From a28c27b73dfc70cc54c38cf2d9611f72997067eb Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Fri, 1 Mar 2024 17:17:11 +0100
Subject: [PATCH 12/16] Make crypto_secretbox CT on DOIT.

Fix xsalsa20poly1305 by swapping its xsalsa20 with the DOIT one.

Add #declassify annotations to xsalsa20poly1305.
They declassify the result of tag verification, which leaks via
a branch (decryption is not done if the tag does not verify).
---
 .../amd64/avx/salsa20_32D.jinc                | 67 ++++++++++++-------
 .../amd64/avx/xsalsa20poly1305.jinc           |  3 +
 .../amd64/avx2/salsa20_32D.jinc               | 67 ++++++++++++-------
 .../amd64/avx2/xsalsa20poly1305.jinc          |  3 +
 .../amd64/ref/salsa20_32D.jinc                | 67 ++++++++++++-------
 .../amd64/ref/xsalsa20poly1305.jinc           |  3 +
 6 files changed, 141 insertions(+), 69 deletions(-)

diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc
index 885f045c..fc76f192 100644
--- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc
+++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/salsa20_32D.jinc
@@ -59,7 +59,7 @@ inline fn __line_ref_32(reg u32[16] k, inline int a b c r) -> reg u32[16]
   reg u32 t;
   t  = k[b];
   t += k[c];
-  _, _, t = #ROL_32(t, r);
+  t = __ROL32x(t, r);
   k[a] ^= t;
   return k;
 }
@@ -75,58 +75,79 @@ inline fn __quarter_round_ref_32(reg u32[16] k, inline int a b c d) -> reg u32[1
 }
 
 
-inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13;
 
-  k = __quarter_round_ref_32(k,  0,  4,  8, 12); k12 = k[12]; k[2] = k2;
-  k = __quarter_round_ref_32(k,  5,  9, 13,  1); k13 = k[13]; k[3] = k3;
-  k = __quarter_round_ref_32(k, 10, 14,  2,  6);
+  k = __quarter_round_ref_32(k,  0,  4,  8, 12); k12 = k[12];
+  k = __quarter_round_ref_32(k,  5,  9, 13,  1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] = k6;
+  k = __quarter_round_ref_32(k, 10, 14,  2,  6); k[3] = k3;
   k = __quarter_round_ref_32(k, 15,  3,  7, 11);
 
-  return k, k12, k13;
+  return k, k9, k12, k13;
 }
 
 
-inline fn __line_round_ref_32(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32
+inline fn __line_round_ref_32(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k2 k3;
+  stack u32 k2 k3 k6;
 
-  k = __quarter_round_ref_32(k,  0,  1,  2,  3); k2 = k[2]; k[12] = k12;
-  k = __quarter_round_ref_32(k,  5,  6,  7,  4); k3 = k[3]; k[13] = k13;
-  k = __quarter_round_ref_32(k, 10, 11,  8,  9);
+  k = __quarter_round_ref_32(k,  0,  1,  2,  3); k2 = k[2]; k3 = k[3];
+  k = __quarter_round_ref_32(k,  5,  6,  7,  4); k6 = k[6]; k[9] = k9;
+  k = __quarter_round_ref_32(k, 10, 11,  8,  9); k[12] = k12; k[13] = k13;
   k = __quarter_round_ref_32(k, 15, 12, 13, 14);
 
-  return k, k2, k3;
+  return k, k2, k3, k6;
 }
 
 
-inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+// The function below requires the spillage of some state on the stack.
+//          ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐                  
+//          │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│  Spilled values │                  
+// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤  + - Value used  
+// │ Column │ +│  │ S│ S│ +│  │ S│  │ +│  │  │  │ +│  │  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │  S - Stack spills
+// │        │  │ +│ S│ S│  │ +│ S│  │  │ +│  │  │ S│ +│  │  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ +│ S│  │  │ +│  │  │ S│ +│  │ S│ S│ +│  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │  │ +│  │  │  │ +│  │ S│  │ +│ S│ S│  │ +│  3              │                  
+// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │ Line   │ +│ +│ +│ +│  │  │  │  │  │ S│  │  │ S│ S│  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│ +│ +│ +│ +│  │ S│  │  │ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │ +│ +│ +│ +│ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │  │  │  │  │ +│ +│ +│ +│  3              │                  
+// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘                  
+//
+inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13;
 
-  k, k12, k13 = __column_round_ref_32(k, k2, k3);
-  k, k2,  k3  = __line_round_ref_32(k, k12, k13);
-  return k, k2, k3;
+  k, k9, k12, k13 = __column_round_ref_32(k, k2, k3, k6);
+  k, k2,  k3,  k6 = __line_round_ref_32(k, k9, k12, k13);
+  return k, k2, k3, k6;
 }
 
 
 inline fn __rounds_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32
 {
-  stack u32 s_c k15;
+  stack u32 s_c k15 k6;
   reg u32 c;
 
+  k6 = k[6];
+
   c = 10;
   while
   { s_c = c;
-    k, k2, k3 = __double_round_ref_32(k, k2, k3);
+    k, k2, k3, k6 = __double_round_ref_32(k, k2, k3, k6);
     c = s_c;
     ?{}, c = #DEC_32(c);
   } (c > 0)
 
-  k15 = k[15];
-  k[2] = k2;
-  k[3] = k3;
+  k15 = k[15]; k[2] = k2; k[3] = k3; k[6] = k6;
   return k, k15;
 }
 
diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc
index 1b96d27b..fb4dbc9c 100644
--- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc
+++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx/xsalsa20poly1305.jinc
@@ -63,6 +63,9 @@ inline fn __xsalsa20poly1305_avx_open(reg u64 m c clen nonce key) -> reg u64
     clen = #LEA(clen - 32);
 
     r = __poly1305_verify_avx_k(tag, ct, clen, subkey_p);
+    // We declassify the result of tag verification, as the function returns it anyway.
+    // This is a hack due to the annotation getting lost if put directly on the inline function.
+    #declassify r = r;
 
     if(r == 0)
     { m = m_s;
diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc
index 885f045c..fc76f192 100644
--- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc
+++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/salsa20_32D.jinc
@@ -59,7 +59,7 @@ inline fn __line_ref_32(reg u32[16] k, inline int a b c r) -> reg u32[16]
   reg u32 t;
   t  = k[b];
   t += k[c];
-  _, _, t = #ROL_32(t, r);
+  t = __ROL32x(t, r);
   k[a] ^= t;
   return k;
 }
@@ -75,58 +75,79 @@ inline fn __quarter_round_ref_32(reg u32[16] k, inline int a b c d) -> reg u32[1
 }
 
 
-inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13; // stack copies of k[9], k[12], k[13] returned to the caller
 
-  k = __quarter_round_ref_32(k,  0,  4,  8, 12); k12 = k[12]; k[2] = k2;
-  k = __quarter_round_ref_32(k,  5,  9, 13,  1); k13 = k[13]; k[3] = k3;
-  k = __quarter_round_ref_32(k, 10, 14,  2,  6);
+  k = __quarter_round_ref_32(k,  0,  4,  8, 12); k12 = k[12]; // copy k[12] to the stack; the remaining quarter rounds do not use index 12
+  k = __quarter_round_ref_32(k,  5,  9, 13,  1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] = k6; // copy 9/13 to the stack; write k2/k6 into k for the next quarter round
+  k = __quarter_round_ref_32(k, 10, 14,  2,  6); k[3] = k3; // write k3 into k for the last quarter round
   k = __quarter_round_ref_32(k, 15,  3,  7, 11);
 
-  return k, k12, k13;
+  return k, k9, k12, k13;
 }
 
 
-inline fn __line_round_ref_32(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32
+inline fn __line_round_ref_32(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k2 k3;
+  stack u32 k2 k3 k6; // stack copies of k[2], k[3], k[6] returned to the caller
 
-  k = __quarter_round_ref_32(k,  0,  1,  2,  3); k2 = k[2]; k[12] = k12;
-  k = __quarter_round_ref_32(k,  5,  6,  7,  4); k3 = k[3]; k[13] = k13;
-  k = __quarter_round_ref_32(k, 10, 11,  8,  9);
+  k = __quarter_round_ref_32(k,  0,  1,  2,  3); k2 = k[2]; k3 = k[3]; // copy 2/3 to the stack; the remaining quarter rounds do not use them
+  k = __quarter_round_ref_32(k,  5,  6,  7,  4); k6 = k[6]; k[9] = k9; // copy 6 to the stack; write k9 into k for the next quarter round
+  k = __quarter_round_ref_32(k, 10, 11,  8,  9); k[12] = k12; k[13] = k13; // write k12/k13 into k for the last quarter round
   k = __quarter_round_ref_32(k, 15, 12, 13, 14);
 
-  return k, k2, k3;
+  return k, k2, k3, k6;
 }
 
 
-inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+// The function below requires the spillage of some state on the stack.
+//          ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐                  
+//          │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│  Spilled values │                  
+// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤  + - Value used  
+// │ Column │ +│  │ S│ S│ +│  │ S│  │ +│  │  │  │ +│  │  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │  S - Stack spills
+// │        │  │ +│ S│ S│  │ +│ S│  │  │ +│  │  │ S│ +│  │  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ +│ S│  │  │ +│  │  │ S│ +│  │ S│ S│ +│  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │  │ +│  │  │  │ +│  │ S│  │ +│ S│ S│  │ +│  3              │                  
+// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │ Line   │ +│ +│ +│ +│  │  │  │  │  │ S│  │  │ S│ S│  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│ +│ +│ +│ +│  │ S│  │  │ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │ +│ +│ +│ +│ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │  │  │  │  │ +│ +│ +│ +│  3              │                  
+// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘                  
+//
+inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13; // stack values handed from the column round to the line round
 
-  k, k12, k13 = __column_round_ref_32(k, k2, k3);
-  k, k2,  k3  = __line_round_ref_32(k, k12, k13);
-  return k, k2, k3;
+  k, k9, k12, k13 = __column_round_ref_32(k, k2, k3, k6); // takes k2/k3/k6 from the stack, hands back k9/k12/k13
+  k, k2,  k3,  k6 = __line_round_ref_32(k, k9, k12, k13); // takes k9/k12/k13 from the stack, hands back k2/k3/k6
+  return k, k2, k3, k6; // same stack triple as on entry, so the caller can loop on this function
 }
 
 
 inline fn __rounds_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32
 {
-  stack u32 s_c k15;
+  stack u32 s_c k15 k6; // s_c: saved loop counter; k15/k6: stack copies of k[15]/k[6]
   reg u32 c;
 
+  k6 = k[6]; // copy k[6] to the stack: __double_round_ref_32 takes k6 as a stack argument
+
   c = 10;
   while
   { s_c = c;
-    k, k2, k3 = __double_round_ref_32(k, k2, k3);
+    k, k2, k3, k6 = __double_round_ref_32(k, k2, k3, k6); // one double round per iteration; c counts down from 10
     c = s_c;
     ?{}, c = #DEC_32(c);
   } (c > 0)
 
-  k15 = k[15];
-  k[2] = k2;
-  k[3] = k3;
+  k15 = k[15]; k[2] = k2; k[3] = k3; k[6] = k6; // copy k[15] to the stack, then write k2/k3/k6 back into k
   return k, k15;
 }
 
diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc
index 76f24a0c..68a8461d 100644
--- a/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc
+++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/avx2/xsalsa20poly1305.jinc
@@ -63,6 +63,9 @@ inline fn __xsalsa20poly1305_avx2_open(reg u64 m c clen nonce key) -> reg u64
     clen = #LEA(clen - 32);
 
     r = __poly1305_verify_avx2_k(tag, ct, clen, subkey_p);
+    // We declassify the result of tag verification, as the function returns it anyway.
+    // This is a hack due to the annotation getting lost if put directly on the inline function.
+    #declassify r = r;
 
     if(r == 0)
     { m = m_s;
diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc
index 885f045c..fc76f192 100644
--- a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc
+++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/salsa20_32D.jinc
@@ -59,7 +59,7 @@ inline fn __line_ref_32(reg u32[16] k, inline int a b c r) -> reg u32[16]
   reg u32 t;
   t  = k[b];
   t += k[c];
-  _, _, t = #ROL_32(t, r);
+  t = __ROL32x(t, r);
   k[a] ^= t;
   return k;
 }
@@ -75,58 +75,79 @@ inline fn __quarter_round_ref_32(reg u32[16] k, inline int a b c d) -> reg u32[1
 }
 
 
-inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+inline fn __column_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13; // stack copies of k[9], k[12], k[13] returned to the caller
 
-  k = __quarter_round_ref_32(k,  0,  4,  8, 12); k12 = k[12]; k[2] = k2;
-  k = __quarter_round_ref_32(k,  5,  9, 13,  1); k13 = k[13]; k[3] = k3;
-  k = __quarter_round_ref_32(k, 10, 14,  2,  6);
+  k = __quarter_round_ref_32(k,  0,  4,  8, 12); k12 = k[12]; // copy k[12] to the stack; the remaining quarter rounds do not use index 12
+  k = __quarter_round_ref_32(k,  5,  9, 13,  1); k9 = k[9]; k13 = k[13]; k[2] = k2; k[6] = k6; // copy 9/13 to the stack; write k2/k6 into k for the next quarter round
+  k = __quarter_round_ref_32(k, 10, 14,  2,  6); k[3] = k3; // write k3 into k for the last quarter round
   k = __quarter_round_ref_32(k, 15,  3,  7, 11);
 
-  return k, k12, k13;
+  return k, k9, k12, k13;
 }
 
 
-inline fn __line_round_ref_32(reg u32[16] k, stack u32 k12 k13) -> reg u32[16], stack u32, stack u32
+inline fn __line_round_ref_32(reg u32[16] k, stack u32 k9 k12 k13) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k2 k3;
+  stack u32 k2 k3 k6; // stack copies of k[2], k[3], k[6] returned to the caller
 
-  k = __quarter_round_ref_32(k,  0,  1,  2,  3); k2 = k[2]; k[12] = k12;
-  k = __quarter_round_ref_32(k,  5,  6,  7,  4); k3 = k[3]; k[13] = k13;
-  k = __quarter_round_ref_32(k, 10, 11,  8,  9);
+  k = __quarter_round_ref_32(k,  0,  1,  2,  3); k2 = k[2]; k3 = k[3]; // copy 2/3 to the stack; the remaining quarter rounds do not use them
+  k = __quarter_round_ref_32(k,  5,  6,  7,  4); k6 = k[6]; k[9] = k9; // copy 6 to the stack; write k9 into k for the next quarter round
+  k = __quarter_round_ref_32(k, 10, 11,  8,  9); k[12] = k12; k[13] = k13; // write k12/k13 into k for the last quarter round
   k = __quarter_round_ref_32(k, 15, 12, 13, 14);
 
-  return k, k2, k3;
+  return k, k2, k3, k6;
 }
 
 
-inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32, stack u32
+// The function below requires the spillage of some state on the stack.
+//          ┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬─────────────────┐                  
+//          │ 0│ 1│ 2│ 3│ 4│ 5│ 6│ 7│ 8│ 9│10│11│12│13│14│15│  Spilled values │                  
+// ┌────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼─────────────────┤  + - Value used  
+// │ Column │ +│  │ S│ S│ +│  │ S│  │ +│  │  │  │ +│  │  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │  S - Stack spills
+// │        │  │ +│ S│ S│  │ +│ S│  │  │ +│  │  │ S│ +│  │  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ +│ S│  │  │ +│  │  │ S│ +│  │ S│ S│ +│  │  4              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │  │ +│  │  │  │ +│  │ S│  │ +│ S│ S│  │ +│  3              │                  
+// ├────────┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │ Line   │ +│ +│ +│ +│  │  │  │  │  │ S│  │  │ S│ S│  │  │  3              │                  
+// │ round  ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│ +│ +│ +│ +│  │ S│  │  │ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │ +│ +│ +│ +│ S│ S│  │  │  5              │                  
+// │        ├──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┼──┤                 │                  
+// │        │  │  │ S│ S│  │  │ S│  │  │  │  │  │ +│ +│ +│ +│  3              │                  
+// └────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴─────────────────┘                  
+//
+inline fn __double_round_ref_32(reg u32[16] k, stack u32 k2 k3 k6) -> reg u32[16], stack u32, stack u32, stack u32
 {
-  stack u32 k12 k13;
+  stack u32 k9 k12 k13; // stack values handed from the column round to the line round
 
-  k, k12, k13 = __column_round_ref_32(k, k2, k3);
-  k, k2,  k3  = __line_round_ref_32(k, k12, k13);
-  return k, k2, k3;
+  k, k9, k12, k13 = __column_round_ref_32(k, k2, k3, k6); // takes k2/k3/k6 from the stack, hands back k9/k12/k13
+  k, k2,  k3,  k6 = __line_round_ref_32(k, k9, k12, k13); // takes k9/k12/k13 from the stack, hands back k2/k3/k6
+  return k, k2, k3, k6; // same stack triple as on entry, so the caller can loop on this function
 }
 
 
 inline fn __rounds_ref_32(reg u32[16] k, stack u32 k2 k3) -> reg u32[16], stack u32
 {
-  stack u32 s_c k15;
+  stack u32 s_c k15 k6; // s_c: saved loop counter; k15/k6: stack copies of k[15]/k[6]
   reg u32 c;
 
+  k6 = k[6]; // copy k[6] to the stack: __double_round_ref_32 takes k6 as a stack argument
+
   c = 10;
   while
   { s_c = c;
-    k, k2, k3 = __double_round_ref_32(k, k2, k3);
+    k, k2, k3, k6 = __double_round_ref_32(k, k2, k3, k6); // one double round per iteration; c counts down from 10
     c = s_c;
     ?{}, c = #DEC_32(c);
   } (c > 0)
 
-  k15 = k[15];
-  k[2] = k2;
-  k[3] = k3;
+  k15 = k[15]; k[2] = k2; k[3] = k3; k[6] = k6; // copy k[15] to the stack, then write k2/k3/k6 back into k
   return k, k15;
 }
 
diff --git a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc
index 93a8a688..8da21efb 100644
--- a/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc
+++ b/src/crypto_secretbox/xsalsa20poly1305/amd64/ref/xsalsa20poly1305.jinc
@@ -61,6 +61,9 @@ inline fn __xsalsa20poly1305_ref_open(reg u64 m c clen nonce key) -> reg u64
     clen = #LEA(clen - 32);
 
     r = __poly1305_verify_ref_k(tag, ct, clen, subkey_p);
+    // We declassify the result of tag verification, as the function returns it anyway.
+    // This is a hack due to the annotation getting lost if put directly on the inline function.
+    #declassify r = r;
 
     if(r == 0)
     { m = m_s;

From e874ab018f341979461d1850a3b34b5c7e0bb0b0 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Thu, 7 Mar 2024 16:43:51 +0100
Subject: [PATCH 13/16] Make crypto_scalarmult CT on DOIT.

---
 .../curve25519/amd64/ref5/mul5.jinc           | 50 +++++++++++++++----
 .../curve25519/amd64/ref5/sqr5.jinc           | 50 +++++++++++++++----
 2 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc
index 8d1c379a..cb56a0c2 100644
--- a/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc
+++ b/src/crypto_scalarmult/curve25519/amd64/ref5/mul5.jinc
@@ -119,18 +119,33 @@ inline fn __mul5_rss(stack u64[5] xa ya) -> reg u64[5]
   cf, r[3] += mulrax;
   _, mulr31 += mulrdx + cf;
   mulredmask = 0x7FFFFFFFFFFFF;
-  ?{}, mulr01 = #SHLD(mulr01, r[0], 13);
+  mulr01 <<= 13;
+  mulrax = r[0];
+  mulrax >>= 51;
+  mulr01 |= mulrax;
   r[0] &= mulredmask;
-  ?{}, mulr11 = #SHLD(mulr11, r[1], 13);
+  mulr11 <<= 13;
+  mulrax = r[1];
+  mulrax >>= 51;
+  mulr11 |= mulrax;
   r[1] &= mulredmask;
   r[1] += mulr01;
-  ?{}, mulr21 = #SHLD(mulr21, r[2], 13);
+  mulr21 <<= 13;
+  mulrax = r[2];
+  mulrax >>= 51;
+  mulr21 |= mulrax;
   r[2] &= mulredmask;
   r[2] += mulr11;
-  ?{}, mulr31 = #SHLD(mulr31, r[3], 13);
+  mulr31 <<= 13;
+  mulrax = r[3];
+  mulrax >>= 51;
+  mulr31 |= mulrax;
   r[3] &= mulredmask;
   r[3] += mulr21;
-  ?{}, mulr41 = #SHLD(mulr41, r[4], 13);
+  mulr41 <<= 13;
+  mulrax = r[4];
+  mulrax >>= 51;
+  mulr41 |= mulrax;
   r[4] &= mulredmask;
   r[4] += mulr31;
   mulr41 = mulr41 * 19;
@@ -293,18 +308,33 @@ fn _mul5_pp(reg ptr u64[5] xa ya) -> reg ptr u64[5]
   cf, r[3] += mulrax;
   _, mulr31 += mulrdx + cf;
   mulredmask = 0x7FFFFFFFFFFFF;
-  ?{}, mulr01 = #SHLD(mulr01, r[0], 13);
+  mulr01 <<= 13;
+  mulrax = r[0];
+  mulrax >>= 51;
+  mulr01 |= mulrax;
   r[0] &= mulredmask;
-  ?{}, mulr11 = #SHLD(mulr11, r[1], 13);
+  mulr11 <<= 13;
+  mulrax = r[1];
+  mulrax >>= 51;
+  mulr11 |= mulrax;
   r[1] &= mulredmask;
   r[1] += mulr01;
-  ?{}, mulr21 = #SHLD(mulr21, r[2], 13);
+  mulr21 <<= 13;
+  mulrax = r[2];
+  mulrax >>= 51;
+  mulr21 |= mulrax;
   r[2] &= mulredmask;
   r[2] += mulr11;
-  ?{}, mulr31 = #SHLD(mulr31, r[3], 13);
+  mulr31 <<= 13;
+  mulrax = r[3];
+  mulrax >>= 51;
+  mulr31 |= mulrax;
   r[3] &= mulredmask;
   r[3] += mulr21;
-  ?{}, mulr41 = #SHLD(mulr41, r[4], 13);
+  mulr41 <<= 13;
+  mulrax = r[4];
+  mulrax >>= 51;
+  mulr41 |= mulrax;
   r[4] &= mulredmask;
   r[4] += mulr31;
   mulr41 = mulr41 * 19;
diff --git a/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc b/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc
index 64a6e3f1..bcca236f 100644
--- a/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc
+++ b/src/crypto_scalarmult/curve25519/amd64/ref5/sqr5.jinc
@@ -79,18 +79,33 @@ inline fn __sqr5_rs(stack u64[5] xa) -> reg u64[5]
   cf, r[3] += squarerax;
   _, squarer31 += squarerdx + cf;
   squareredmask = 0x7FFFFFFFFFFFF;
-  _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13);
+  squarer01 <<= 13;
+  squarerax = r[0];
+  squarerax >>= 51;
+  squarer01 |= squarerax;
   r[0] &= squareredmask;
-  _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13);
+  squarer11 <<= 13;
+  squarerax = r[1];
+  squarerax >>= 51;
+  squarer11 |= squarerax;
   r[1] &= squareredmask;
   r[1] += squarer01;
-  _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13);
+  squarer21 <<= 13;
+  squarerax = r[2];
+  squarerax >>= 51;
+  squarer21 |= squarerax;
   r[2] &= squareredmask;
   r[2] += squarer11;
-  _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13);
+  squarer31 <<= 13;
+  squarerax = r[3];
+  squarerax >>= 51;
+  squarer31 |= squarerax;
   r[3] &= squareredmask;
   r[3] += squarer21;
-  _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13);
+  squarer41 <<= 13;
+  squarerax = r[4];
+  squarerax >>= 51;
+  squarer41 |= squarerax;
   r[4] &= squareredmask;
   r[4] += squarer31;
   squarer41 = squarer41 * 19;
@@ -217,18 +232,33 @@ fn _sqr5_p(reg ptr u64[5] xa) -> reg ptr u64[5]
   cf, r[3] += squarerax;
   _, squarer31 += squarerdx + cf;
   squareredmask = 0x7FFFFFFFFFFFF;
-  _, _, _, _, _, squarer01 = #SHLD(squarer01, r[0], 13);
+  squarer01 <<= 13;
+  squarerax = r[0];
+  squarerax >>= 51;
+  squarer01 |= squarerax;
   r[0] &= squareredmask;
-  _, _, _, _, _, squarer11 = #SHLD(squarer11, r[1], 13);
+  squarer11 <<= 13;
+  squarerax = r[1];
+  squarerax >>= 51;
+  squarer11 |= squarerax;
   r[1] &= squareredmask;
   r[1] += squarer01;
-  _, _, _, _, _, squarer21 = #SHLD(squarer21, r[2], 13);
+  squarer21 <<= 13;
+  squarerax = r[2];
+  squarerax >>= 51;
+  squarer21 |= squarerax;
   r[2] &= squareredmask;
   r[2] += squarer11;
-  _, _, _, _, _, squarer31 = #SHLD(squarer31, r[3], 13);
+  squarer31 <<= 13;
+  squarerax = r[3];
+  squarerax >>= 51;
+  squarer31 |= squarerax;
   r[3] &= squareredmask;
   r[3] += squarer21;
-  _, _, _, _, _, squarer41 = #SHLD(squarer41, r[4], 13);
+  squarer41 <<= 13;
+  squarerax = r[4];
+  squarerax >>= 51;
+  squarer41 |= squarerax;
   r[4] &= squareredmask;
   r[4] += squarer31;
   squarer41 = squarer41 * 19;

From 27b5e775f16e28a3c00e4476a6ead53cd121fcda Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Fri, 8 Mar 2024 15:58:02 +0100
Subject: [PATCH 14/16] Fix non-doit ROLs in Keccak in Dilithium.

---
 src/crypto_sign/dilithium/common/amd64/fips202.jinc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/crypto_sign/dilithium/common/amd64/fips202.jinc b/src/crypto_sign/dilithium/common/amd64/fips202.jinc
index aeb015ad..25c8bd16 100644
--- a/src/crypto_sign/dilithium/common/amd64/fips202.jinc
+++ b/src/crypto_sign/dilithium/common/amd64/fips202.jinc
@@ -1,3 +1,4 @@
+from Jade require "common/doit.jinc"
 /* -- Stolen and modified from the Kyber repo -- */
 
 param int SHAKE128_RATE = 168;
@@ -24,7 +25,7 @@ fn theta(reg ptr u64[25] a) -> reg ptr u64[25] {
 
   for x = 0 to 5 {
     d[x] = c[(x + 1) % 5];
-    ?{}, d[x] = #ROL_64(d[x], 1);
+    d[x] = __ROL64(d[x], 1);
     d[x] ^= c[(x + 4) % 5];
   }
 
@@ -66,7 +67,7 @@ fn rho(reg ptr u64[25] a) -> reg ptr u64[25] {
     for y = 0 to 5 {
       i = index(x, y);
       z = keccakRhoOffsets(i);
-      _, _, a[i] = #ROL_64(a[i], z);
+      a[i] = __ROL64(a[i], z);
     }
   }
 

From 1b7cf841167c41ffcd2cc168b88ece578d288058 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Fri, 15 Mar 2024 17:04:17 +0100
Subject: [PATCH 15/16] Add WIP of benchmark plotting.

---
 bench/plot.py | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100755 bench/plot.py

diff --git a/bench/plot.py b/bench/plot.py
new file mode 100755
index 00000000..c07bfd72
--- /dev/null
+++ b/bench/plot.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+
+import csv
+from pathlib import Path
+from dataclasses import dataclass
+from enum import IntEnum, Enum, auto
+from matplotlib import pyplot as plt
+import click
+
+
+class OpType(IntEnum):  # column layout of one CSV row, as indexed by plot_op1/plot_op2/plot_op3
+    OP1 = 1 # number_of_iterations, check_is_ok, sdev, mean, median, list_of_results
+    OP2 = 2 # inlen, number_of_iterations, check_is_ok, sdev, mean, median, list_of_results
+    OP3 = 3 # outlen, inlen, number_of_iterations, check_is_ok, sdev, mean, median, list_of_results
+
+
+@dataclass(frozen=True)
+class ImplFunction(object):  # one benchmarked function of an API; frozen so instances are hashable (main uses them as dict keys)
+    name: str  # CSV file-name suffix after the variant; "" when the file is named after the variant alone
+    optype: OpType  # CSV row layout used by this function's results
+
+
+class ImplType(Enum):  # member name = directory under bin/; value = tuple of that API's ImplFunctions (values must stay distinct, equal values would make Enum alias the members)
+    crypto_hash = (ImplFunction("", OpType.OP2), )
+    crypto_kem = (ImplFunction("keypair", OpType.OP1),
+                  ImplFunction("keypair_derand", OpType.OP1),
+                  ImplFunction("enc", OpType.OP1),
+                  ImplFunction("enc_derand", OpType.OP1),
+                  ImplFunction("dec", OpType.OP1))
+    crypto_onetimeauth = (ImplFunction("", OpType.OP2), ImplFunction("verify", OpType.OP2))
+    crypto_scalarmult = (ImplFunction("base", OpType.OP1), ImplFunction("", OpType.OP1))
+    crypto_secretbox = (ImplFunction("", OpType.OP2), ImplFunction("open", OpType.OP2), ImplFunction("open_forgery", OpType.OP2))
+    crypto_sign = (ImplFunction("keypair", OpType.OP1), ImplFunction("", OpType.OP2), ImplFunction("open", OpType.OP2))
+    crypto_stream = (ImplFunction("", OpType.OP2), ImplFunction("xor", OpType.OP2))
+    crypto_xof = (ImplFunction("", OpType.OP3), )
+
+
+@dataclass
+class Results(object):  # the parsed contents of one benchmark CSV file plus where it came from
+    name: str  # load_directory passes the name of the results directory here
+    """Name of the results config (machine/before/after/DOITM)."""
+    type: ImplType
+    """Implementation type (like crypto_kem, ...)."""
+    impl: str
+    """Implementation (like kyber512...)."""
+    arch: str
+    """Architecture (like amd64...)."""
+    variant: str
+    """The variant (like ref, avx, ...)."""
+    func: ImplFunction
+    """The function (like keypair gen, ...)."""
+    data: list  # one list of parsed numbers per CSV row, laid out as described by func.optype
+    """The raw data."""
+
+    def __str__(self) -> str:  # single-line identifier built from every field except the data
+        return f"{self.name} {self.type.name} {self.impl} {self.arch} {self.variant} {self.func.name} {self.func.optype.name}"
+
+    def __repr__(self) -> str:  # identifier line followed by every data row, comma-separated
+        s = str(self) + "\n"
+        for l in self.data:
+            s += ", ".join(map(str, l)) + "\n"
+        return s
+
+
+def plot_op1(ax, *results: Results):  # box plot of the raw measurements of OP1 results, one box per CSV row
+    labels = []  # tick label of each box, kept in step with `data`
+    data = []  # one list of raw measurements per box
+    for result in results:
+        if not(result.data):
+            print(f"Skipping {result}")  # nothing to plot for an empty CSV
+            continue
+        for line in result.data:
+            measurements = line[0]
+            ok = line[1]
+            sdev = line[2]
+            mean = line[3]
+            median = line[4]
+            rest = line[5:]  # raw measurements follow the five summary columns of the OP1 layout
+            data.append(rest)
+            labels.append(result.name)  # one label per appended row, so labels and data cannot get out of step
+    ax.boxplot(data, labels=labels)
+    ax.set_ylabel("cycles")
+
+
+def plot_op2(ax, *results: Results):  # line plot of mean against input length, one line per non-empty result
+    for result in results:
+        if not(result.data):
+            print(f"Skipping {result}")  # nothing to plot for an empty CSV
+            continue
+        lengths = []  # x values: first column of each row
+        sdevs = []  # collected but not used by the plot below
+        means = []  # y values: fifth column of each row
+        for line in result.data:
+            inlen = line[0]
+            measurements = line[1]
+            ok = line[2]
+            sdev = line[3]
+            mean = line[4]
+            median = line[5]
+            rest = line[6:]
+            lengths.append(inlen)
+            sdevs.append(sdev)
+            means.append(mean)
+        ax.plot(lengths, means, label=result.name)  # label feeds the legend drawn after the loop
+    ax.legend(loc="best")
+    ax.set_xlabel("inlen")
+    ax.set_ylabel("cycles")
+
+
+def plot_op3(ax, *results: Results):  # line plot of mean for OP3 results, one plot call per non-empty result
+    for result in results:
+        if not(result.data):
+            print(f"Skipping {result}")  # nothing to plot for an empty CSV
+            continue
+        lengths = []  # x values: (outlen, inlen) pairs
+        sdevs = []  # collected but not used by the plot below
+        means = []  # y values: sixth column of each row
+        for line in result.data:
+            outlen = line[0]
+            inlen = line[1]
+            measurements = line[2]
+            ok = line[3]
+            sdev = line[4]
+            mean = line[5]
+            median = line[6]
+            rest = line[7:]
+            lengths.append((outlen, inlen))  # NOTE(review): x is a pair here while the axis below is labelled "inlen" - confirm the intended x axis
+            sdevs.append(sdev)
+            means.append(mean)
+        ax.plot(lengths, means, label=result.name)  # NOTE(review): a list of pairs reaches matplotlib as a two-column x - verify the resulting plot
+    ax.legend(loc="best")
+    ax.set_xlabel("inlen")
+    ax.set_ylabel("cycles")
+
+
+def load_directory(directory: Path) -> list[Results]:  # walks <directory>/bin/<type>/.../<impl>/<arch>/<variant>/*.csv and returns one Results per CSV
+    bin_dir = directory / "bin"
+    all_results = []
+    for impl_type in ImplType:
+        type_dir = bin_dir / impl_type.name  # iterdir() below raises if this directory is missing, so every ImplType must be present
+        # sign, kem and stream have an additional subdirectory per primitive (xsalsa20 is the exception)
+        if impl_type in (ImplType.crypto_sign, ImplType.crypto_kem, ImplType.crypto_stream):
+            top_levels = list(type_dir.iterdir())
+            impl_dirs = sum(map(lambda top: list(top.iterdir()) if top.name != "xsalsa20" else [top], top_levels), [])  # flatten one level, keeping xsalsa20 itself as an implementation directory
+        else:
+            impl_dirs = list(type_dir.iterdir())
+        for impl_dir in impl_dirs:
+            impl_name = impl_dir.name
+            for arch_dir in impl_dir.iterdir():
+                arch = arch_dir.name
+                for variant_dir in arch_dir.iterdir():
+                    variant = variant_dir.name
+                    for fname in variant_dir.glob("*.csv"):
+                        for func in impl_type.value:  # find the ImplFunction this file belongs to
+                            func_name = f"{variant}_{func.name}" if func.name else variant  # expected file stem: "<variant>" or "<variant>_<func>"
+                            if str(fname).endswith(func_name + ".csv"):
+                                break  # `func` stays bound to the match and is used below
+                        else:
+                            raise ValueError("Unknown function")  # the CSV matched none of this type's functions
+                        with fname.open("r") as f:
+                            reader = csv.reader(f)
+                            data = [list(map(lambda x: float(x.strip()) if "." in x else int(x.strip()), line)) for line in reader]  # cells containing "." parse as float, all others as int
+                            results = Results(directory.name, impl_type, impl_name, arch, variant, func, data)
+                            all_results.append(results)
+    return all_results
+
+
+@click.command()
+@click.argument("dirs", nargs=-1, type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), required=True)
+def main(dirs):  # loads every results directory and saves one PNG per (type, impl, arch, variant) into the working directory
+    # (ImplType, impl, arch, variant, func) -> list[Results]
+    result_map = {}
+    for directory in dirs:
+        click.echo(f"Processing {directory}.")
+        results = load_directory(directory)
+        for r in results:
+            ident = (r.type, r.impl, r.arch, r.variant, r.func)
+            result_map.setdefault(ident, [])
+            result_map[ident].append(r)  # results of the same function from different directories end up together
+
+    # (ImplType, impl, arch, variant) -> (func -> list[Results])
+    func_map = {}
+    for ident, results in result_map.items():
+        merged_ident = (ident[0], ident[1], ident[2], ident[3]) # all but the func
+        func_map.setdefault(merged_ident, {})
+        func_map[merged_ident][ident[4]] = results
+
+    for ident, func_result_map in func_map.items():
+        funcs = ident[0].value
+        name = f"{ident[0].name}_{ident[1]}_{ident[2]}_{ident[3]}"  # computed first: the message in the loop below reads it
+        fig, axs = plt.subplots(len(funcs), figsize=(5, len(funcs)*4))  # one subplot per function of this API
+        if len(funcs) == 1:
+            axs = [axs]  # with a single subplot plt.subplots returns the Axes itself, not a sequence
+        for i, func in enumerate(funcs):
+            results = func_result_map.get(func)
+            if not results:
+                continue  # no CSV was found for this function; its subplot stays empty
+            ax = axs[i]
+            if len(results) <= 1:
+                print(f"Not enough results for {name}.")
+                continue  # a comparison needs results from at least two directories
+            ax.set_title(func.name)
+            if func.optype == OpType.OP1:
+                plot_op1(ax, *results)
+            elif func.optype == OpType.OP2:
+                plot_op2(ax, *results)
+            elif func.optype == OpType.OP3:
+                plot_op3(ax, *results)
+        fname = name + ".png"
+        fig.suptitle(name)
+        fig.tight_layout()
+        fig.savefig(fname, dpi=300)
+        plt.close(fig)  # release the figure before building the next one
+
+
+if __name__ == "__main__":
+    main()  # click parses the command line and supplies `dirs`
\ No newline at end of file

From a1fa0630b9214e9d7debe1856e3566ec5a008708 Mon Sep 17 00:00:00 2001
From: J08nY <johny@neuromancer.sk>
Date: Wed, 8 May 2024 16:04:00 +0200
Subject: [PATCH 16/16] Use jazzct in Makefile.checksct.

---
 src/Makefile.checksct | 4 ++--
 src/Makefile.common   | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Makefile.checksct b/src/Makefile.checksct
index 7a1bfc0c..6055547e 100644
--- a/src/Makefile.checksct
+++ b/src/Makefile.checksct
@@ -7,8 +7,8 @@ ifneq ($(OP),)
 
 SCT_FLAGS  ?= 
 
-CHECK_SCT_S = ($(JASMINC) -slice $* -checkSCT $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
-CHECK_SCT   = ($(JASMINC)           -checkSCT $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT_S = ($(JAZZCT) --slice $* --speculative $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
+CHECK_SCT   = ($(JAZZCT)            --speculative $(SCT_FLAGS) $< > $@ 2>&1) $(CIT)
 
 SCT_TARGETS  = $(addsuffix .sct, $(FUNCTIONS))
 
diff --git a/src/Makefile.common b/src/Makefile.common
index ab28f62b..10a967ee 100644
--- a/src/Makefile.common
+++ b/src/Makefile.common
@@ -36,6 +36,7 @@ JEXT    ?= jazz
 override JFLAGS += -noinsertarraycopy
 JINCLUDE = -I Jade:$(SRC)
 JASMIN  ?= jasminc
+JAZZCT  ?= jazzct
 JASMINC := $(JASMIN) $(JFLAGS) $(JINCLUDE)
 COMPILE  = ($(JASMINC) -o $@ $<) $(CIT)