diff --git a/mlkem/native/aarch64/intt_clean.S b/mlkem/native/aarch64/intt_clean.S
index c6a290aba..73d1eb9c2 100644
--- a/mlkem/native/aarch64/intt_clean.S
+++ b/mlkem/native/aarch64/intt_clean.S
@@ -30,8 +30,9 @@
 #include "common.i"
 
 // Bounds:
-// In: |src| < q * C
-// Out: |dst| < q * (0.0508 * C + 1/2)
+// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
+//
+// See mlken/reduce.c and test/test_bounds.py for more details.
 .macro mulmodq dst, src, const, idx0, idx1
         // Signed barrett multiplication using
         // round-to-nearest-even-integer approximation.
diff --git a/mlkem/native/aarch64/intt_opt.S b/mlkem/native/aarch64/intt_opt.S
index 4ff961feb..873790c96 100644
--- a/mlkem/native/aarch64/intt_opt.S
+++ b/mlkem/native/aarch64/intt_opt.S
@@ -29,6 +29,10 @@
 // Needed to provide ASM_LOAD directive
 #include "common.i"
 
+// Bounds:
+// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
+//
+// See mlken/reduce.c and test/test_bounds.py for more details.
 .macro mulmodq dst, src, const, idx0, idx1
         sqrdmulh t2.8h,      \src\().8h, \const\().h[\idx1\()]
         mul      \dst\().8h, \src\().8h, \const\().h[\idx0\()]
diff --git a/mlkem/native/aarch64/ntt_clean.S b/mlkem/native/aarch64/ntt_clean.S
index 39298093d..e60713f4d 100644
--- a/mlkem/native/aarch64/ntt_clean.S
+++ b/mlkem/native/aarch64/ntt_clean.S
@@ -30,8 +30,7 @@
 #include "common.i"
 
 // Bounds:
-// In: |src| < q * C
-// Out: |dst| < q * (0.0508 * C + 1/2)
+// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
 //
 // See mlken/reduce.c and test/test_bounds.py for more details.
 .macro mulmodq dst, src, const, idx0, idx1
@@ -218,13 +217,13 @@ _ntt_asm_clean:
         // Bounds reasoning:
         // - There are 7 layers
         // - When passing from layer N to layer N+1, each layer-N value
-        //   is modified through the addition/subtraction of a Montgomery
-        //   product of a twiddle of absolute value < q/2 and a layer-N value.
-        // - Recalling that |fqmul(a,t)| < q * (0.0254*C + 1/2) for
-        //   |a| < C*q and |t|<q/2, we see that the coefficients of layer N
-        //   (starting with layer 0 = input data) are bound by
-        //   q * f^N(1) = q * f(f(...(f(1))...)), where f(C) = 1/2 + 1.0508*C.
-        //   For N=7, we get the bound of f^7(1) * q < 18295.
+        // is modified through the addition/subtraction of a Montgomery
+        // product of a twiddle of absolute value < q/2 and a layer-N value.
+        // - Recalling that for C such that |a| < C * q and |t|<q/2, we have
+        // |fqmul(a,t)| < q * (0.0254*C + 1/2), we see that the coefficients
+        // of layer N (starting with layer 0 = input data) are bound by q * f^N(1),
+        // where f(C) = 1/2 + 1.0508*C.
+        // For N=7, we get the bound of f^7(1) * q < 18295.
         //
         // See test/test_bounds.py for more details.
 
diff --git a/mlkem/native/aarch64/ntt_opt.S b/mlkem/native/aarch64/ntt_opt.S
index f01982edb..09a0e03c6 100644
--- a/mlkem/native/aarch64/ntt_opt.S
+++ b/mlkem/native/aarch64/ntt_opt.S
@@ -30,8 +30,7 @@
 #include "common.i"
 
 // Bounds:
-// In: |src| < q * C
-// Out: |dst| < q * (0.0508 * C + 1/2)
+// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2)
 //
 // See mlken/reduce.c and test/test_bounds.py for more details.
 .macro mulmodq dst, src, const, idx0, idx1
@@ -220,10 +219,10 @@ _ntt_asm_opt:
         // - When passing from layer N to layer N+1, each layer-N value
         // is modified through the addition/subtraction of a Montgomery
         // product of a twiddle of absolute value < q/2 and a layer-N value.
-        // - Recalling that |fqmul(a,t)| < q * (0.0254*C + 1/2) for
-        // |a| < C*q and |t|<q/2, we see that the coefficients of layer N
-        // (starting with layer 0 = input data) are bound by q * f^N(1), where
-        // f(C) = 1/2 + 1.0508*C.
+        // - Recalling that for C such that |a| < C * q and |t|<q/2, we have
+        // |fqmul(a,t)| < q * (0.0254*C + 1/2), we see that the coefficients
+        // of layer N (starting with layer 0 = input data) are bound by q * f^N(1),
+        // where f(C) = 1/2 + 1.0508*C.
         // For N=7, we get the bound of f^7(1) * q < 18295.
 
                                           // Instructions:    10
diff --git a/mlkem/reduce.c b/mlkem/reduce.c
index d6cee8fca..258675139 100644
--- a/mlkem/reduce.c
+++ b/mlkem/reduce.c
@@ -13,7 +13,7 @@
  *
  * Returns:     integer congruent to a * R^-1 modulo q
  *
- *              Bounds: If |a| < q * C, then the return value
+ *              Bounds: For any C such that |a| < q * C, the return value
  *              has absolute value < q (C/2^16 + 1/2).
  *
  *              Notable special cases:
@@ -32,13 +32,13 @@ int16_t montgomery_reduce(int32_t a) {
 
   // Bounds on paper
   //
-  // - Case |a| < q * C
+  // - Case |a| < q * C, for some C
   //
   // |t| <= |a|/2^16 + |t|*q/2^16
   //      < q * C / 2^16 + q/2
   //      = q (C/2^16 + 1/2)
   //
-  // - Case |a| < (q/2) * C * q
+  // - Case |a| < (q/2) * C * q, for some C
   //
   // Replace C -> C * q in the above and estimate
   // q / 2^17 < 0.0254.