From 84dcf8570020d6c9b785dad9b27ebb0fd9db60ed Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Fri, 5 Jan 2024 16:10:29 +0100
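Subject: [PATCH] Run copy_from_upstream.py -k

Regenerate the vendored sources and algorithm documentation:

- Kyber: point the primary and optimized upstream references at
  pq-crystals/kyber commit 272125f6acc8e8b6850fd68ceb901a660ff48196
  (previously dda29cc63af721981ee2c831cf00822e69be3220).
- Kyber ref (kyber512/768/1024): in poly_compress and polyvec_compress,
  replace the division by KYBER_Q with a multiply-and-shift by
  precomputed constants; the previous expression is kept as a comment.
- Classic McEliece docs: the "No branching-on-secrets" claimed/checked
  columns for the clean and avx2 implementations now read False; drop
  trailing whitespace from the constant-time advisory and rewrap it in
  the YAML.
- Falcon docs: the "No branching-on-secrets" claimed/checked columns for
  the clean implementation now read True.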
---
 docs/algorithms/kem/classic_mceliece.md       | 42 +++++++++----------
 docs/algorithms/kem/classic_mceliece.yml      |  4 +-
 docs/algorithms/kem/kyber.md                  |  4 +-
 docs/algorithms/kem/kyber.yml                 |  2 +-
 docs/algorithms/sig/falcon.md                 |  4 +-
 .../pqcrystals-kyber_kyber1024_ref/poly.c     | 15 ++++++-
 .../pqcrystals-kyber_kyber1024_ref/polyvec.c  | 17 +++++++-
 .../pqcrystals-kyber_kyber512_ref/poly.c      | 15 ++++++-
 .../pqcrystals-kyber_kyber512_ref/polyvec.c   | 17 +++++++-
 .../pqcrystals-kyber_kyber768_ref/poly.c      | 15 ++++++-
 .../pqcrystals-kyber_kyber768_ref/polyvec.c   | 17 +++++++-
 11 files changed, 113 insertions(+), 39 deletions(-)

diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md
index b193065b5a..29c2d745e3 100644
--- a/docs/algorithms/kem/classic_mceliece.md
+++ b/docs/algorithms/kem/classic_mceliece.md
@@ -14,7 +14,7 @@
 ## Advisories
 
 - Classic-McEliece-460896, Classic-McEliece-460896f, Classic-McEliece-6960119, and Classic-McEliece-6960119f parameter sets fail memory leak testing on x86-64 when building with ``clang`` using optimization level ``-O2`` and ``-O3``. Care is advised when using the algorithm at higher optimization levels, and any other compiler and architecture.
-- Current implementation of the algorithm may not be constant-time. Additionally, environment specific constant-time leaks may not be documented; please report potential constant-time leaks when found. 
+- Current implementation of the algorithm may not be constant-time. Additionally, environment specific constant-time leaks may not be documented; please report potential constant-time leaks when found.
 
 ## Parameter set summary
 
@@ -35,8 +35,8 @@
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?‡   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                  |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | True                                           | True                  |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                  |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | False                                          | True                  |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -46,8 +46,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -55,8 +55,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -64,8 +64,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -73,8 +73,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -82,8 +82,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -91,8 +91,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -100,8 +100,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -109,8 +109,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT             | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -118,8 +118,8 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | True                 |
-| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | True                                           | True                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | True                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,POPCNT,BMI1        | False                              | False                                          | True                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
diff --git a/docs/algorithms/kem/classic_mceliece.yml b/docs/algorithms/kem/classic_mceliece.yml
index a5fcf751fc..4916af115e 100644
--- a/docs/algorithms/kem/classic_mceliece.yml
+++ b/docs/algorithms/kem/classic_mceliece.yml
@@ -26,7 +26,9 @@ advisories:
   building with ``clang`` using optimization level ``-O2`` and ``-O3``. Care is advised
   when using the algorithm at higher optimization levels, and any other compiler and
   architecture.
-- Current implementation of the algorithm may not be constant-time. Additionally, environment specific constant-time leaks may not be documented; please report potential constant-time leaks when found. 
+- Current implementation of the algorithm may not be constant-time. Additionally,
+  environment specific constant-time leaks may not be documented; please report potential
+  constant-time leaks when found.
 parameter-sets:
 - name: Classic-McEliece-348864
   claimed-nist-level: 1
diff --git a/docs/algorithms/kem/kyber.md b/docs/algorithms/kem/kyber.md
index 9279672346..8191dfbfe8 100644
--- a/docs/algorithms/kem/kyber.md
+++ b/docs/algorithms/kem/kyber.md
@@ -7,9 +7,9 @@
 - **Authors' website**: https://pq-crystals.org/
 - **Specification version**: NIST Round 3 submission.
 - **Primary Source**<a name="primary-source"></a>:
-  - **Source**: https://github.com/pq-crystals/kyber/commit/dda29cc63af721981ee2c831cf00822e69be3220 with copy_from_upstream patches
+  - **Source**: https://github.com/pq-crystals/kyber/commit/272125f6acc8e8b6850fd68ceb901a660ff48196 with copy_from_upstream patches
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
-- **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/dda29cc63af721981ee2c831cf00822e69be3220 with copy_from_upstream patches
+- **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/272125f6acc8e8b6850fd68ceb901a660ff48196 with copy_from_upstream patches
   - **pqclean-aarch64**:<a name="pqclean-aarch64"></a>
       - **Source**: https://github.com/PQClean/PQClean/commit/8e220a87308154d48fdfac40abbb191ac7fce06a with copy_from_upstream patches
       - **Implementation license (SPDX-Identifier)**: CC0-1.0 and (CC0-1.0 or Apache-2.0) and (CC0-1.0 or MIT) and MIT
diff --git a/docs/algorithms/kem/kyber.yml b/docs/algorithms/kem/kyber.yml
index f3dc15ad76..096c5702a2 100644
--- a/docs/algorithms/kem/kyber.yml
+++ b/docs/algorithms/kem/kyber.yml
@@ -17,7 +17,7 @@ website: https://pq-crystals.org/
 nist-round: 3
 spec-version: NIST Round 3 submission
 primary-upstream:
-  source: https://github.com/pq-crystals/kyber/commit/dda29cc63af721981ee2c831cf00822e69be3220
+  source: https://github.com/pq-crystals/kyber/commit/272125f6acc8e8b6850fd68ceb901a660ff48196
     with copy_from_upstream patches
   spdx-license-identifier: CC0-1.0 or Apache-2.0
 optimized-upstreams:
diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md
index 101ffa9a98..08598e3b47 100644
--- a/docs/algorithms/sig/falcon.md
+++ b/docs/algorithms/sig/falcon.md
@@ -22,7 +22,7 @@
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?‡   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | False                 |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | False                 |
 | [Primary Source](#primary-source) | avx2                     | x86\_64                     | All                             | AVX2                    | False                              | False                                          | False                 |
 | [Primary Source](#primary-source) | aarch64                  | ARM64\_V8                   | Linux,Darwin                    | None                    | False                              | False                                          | False                 |
 
@@ -34,7 +34,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 |       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
-| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | False                              | False                                          | False                |
+| [Primary Source](#primary-source) | clean                    | All                         | All                             | None                    | True                               | True                                           | False                |
 | [Primary Source](#primary-source) | avx2                     | x86\_64                     | All                             | AVX2                    | False                              | False                                          | False                |
 | [Primary Source](#primary-source) | aarch64                  | ARM64\_V8                   | Linux,Darwin                    | None                    | False                              | False                                          | False                |
 
diff --git a/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/poly.c b/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/poly.c
index 017cacf5d6..3e73579e68 100644
--- a/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/poly.c
+++ b/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/poly.c
@@ -19,6 +19,7 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
 {
   unsigned int i,j;
   int16_t u;
+  uint32_t d0;
   uint8_t t[8];
 
 #if (KYBER_POLYCOMPRESSEDBYTES == 128)
@@ -27,7 +28,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
     }
 
     r[0] = t[0] | (t[1] << 4);
@@ -42,7 +48,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
     }
 
     r[0] = (t[0] >> 0) | (t[1] << 5);
diff --git a/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/polyvec.c b/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/polyvec.c
index 8420d069c2..669f6a5f1d 100644
--- a/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/polyvec.c
+++ b/src/kem/kyber/pqcrystals-kyber_kyber1024_ref/polyvec.c
@@ -15,6 +15,7 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
 {
   unsigned int i,j,k;
+  uint64_t d0;
 
 #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
   uint16_t t[8];
@@ -23,7 +24,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<8;k++) {
         t[k]  = a->vec[i].coeffs[8*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff;
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
       }
 
       r[ 0] = (t[0] >>  0);
@@ -47,7 +54,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<4;k++) {
         t[k]  = a->vec[i].coeffs[4*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff;
+/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+        d0 = t[k];
+        d0 <<= 10;
+        d0 += 1665;
+        d0 *= 1290167;
+        d0 >>= 32;
+        t[k] = d0 & 0x3ff;
       }
 
       r[0] = (t[0] >> 0);
diff --git a/src/kem/kyber/pqcrystals-kyber_kyber512_ref/poly.c b/src/kem/kyber/pqcrystals-kyber_kyber512_ref/poly.c
index 017cacf5d6..3e73579e68 100644
--- a/src/kem/kyber/pqcrystals-kyber_kyber512_ref/poly.c
+++ b/src/kem/kyber/pqcrystals-kyber_kyber512_ref/poly.c
@@ -19,6 +19,7 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
 {
   unsigned int i,j;
   int16_t u;
+  uint32_t d0;
   uint8_t t[8];
 
 #if (KYBER_POLYCOMPRESSEDBYTES == 128)
@@ -27,7 +28,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
     }
 
     r[0] = t[0] | (t[1] << 4);
@@ -42,7 +48,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
     }
 
     r[0] = (t[0] >> 0) | (t[1] << 5);
diff --git a/src/kem/kyber/pqcrystals-kyber_kyber512_ref/polyvec.c b/src/kem/kyber/pqcrystals-kyber_kyber512_ref/polyvec.c
index 8420d069c2..669f6a5f1d 100644
--- a/src/kem/kyber/pqcrystals-kyber_kyber512_ref/polyvec.c
+++ b/src/kem/kyber/pqcrystals-kyber_kyber512_ref/polyvec.c
@@ -15,6 +15,7 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
 {
   unsigned int i,j,k;
+  uint64_t d0;
 
 #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
   uint16_t t[8];
@@ -23,7 +24,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<8;k++) {
         t[k]  = a->vec[i].coeffs[8*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff;
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
       }
 
       r[ 0] = (t[0] >>  0);
@@ -47,7 +54,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<4;k++) {
         t[k]  = a->vec[i].coeffs[4*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff;
+/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+        d0 = t[k];
+        d0 <<= 10;
+        d0 += 1665;
+        d0 *= 1290167;
+        d0 >>= 32;
+        t[k] = d0 & 0x3ff;
       }
 
       r[0] = (t[0] >> 0);
diff --git a/src/kem/kyber/pqcrystals-kyber_kyber768_ref/poly.c b/src/kem/kyber/pqcrystals-kyber_kyber768_ref/poly.c
index 017cacf5d6..3e73579e68 100644
--- a/src/kem/kyber/pqcrystals-kyber_kyber768_ref/poly.c
+++ b/src/kem/kyber/pqcrystals-kyber_kyber768_ref/poly.c
@@ -19,6 +19,7 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
 {
   unsigned int i,j;
   int16_t u;
+  uint32_t d0;
   uint8_t t[8];
 
 #if (KYBER_POLYCOMPRESSEDBYTES == 128)
@@ -27,7 +28,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
     }
 
     r[0] = t[0] | (t[1] << 4);
@@ -42,7 +48,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
     }
 
     r[0] = (t[0] >> 0) | (t[1] << 5);
diff --git a/src/kem/kyber/pqcrystals-kyber_kyber768_ref/polyvec.c b/src/kem/kyber/pqcrystals-kyber_kyber768_ref/polyvec.c
index 8420d069c2..669f6a5f1d 100644
--- a/src/kem/kyber/pqcrystals-kyber_kyber768_ref/polyvec.c
+++ b/src/kem/kyber/pqcrystals-kyber_kyber768_ref/polyvec.c
@@ -15,6 +15,7 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
 {
   unsigned int i,j,k;
+  uint64_t d0;
 
 #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
   uint16_t t[8];
@@ -23,7 +24,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<8;k++) {
         t[k]  = a->vec[i].coeffs[8*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff;
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
       }
 
       r[ 0] = (t[0] >>  0);
@@ -47,7 +54,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<4;k++) {
         t[k]  = a->vec[i].coeffs[4*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff;
+/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+        d0 = t[k];
+        d0 <<= 10;
+        d0 += 1665;
+        d0 *= 1290167;
+        d0 >>= 32;
+        t[k] = d0 & 0x3ff;
       }
 
       r[0] = (t[0] >> 0);