From 0b6f740ceae03b54282bc2ae940a3f32f3abaa98 Mon Sep 17 00:00:00 2001
From: Patrick Longa <plonga@microsoft.com>
Date: Thu, 4 Feb 2021 17:18:34 -0800
Subject: [PATCH] Optimizing xDBLADD, some minor editing

---
 src/compression/dlog.c | 25 +++++++++----------------
 src/ec_isogeny.c       | 39 +++++++++++++++++++--------------------
 src/fpx.c              |  2 +-
 3 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/src/compression/dlog.c b/src/compression/dlog.c
index 36786db..34e45c7 100644
--- a/src/compression/dlog.c
+++ b/src/compression/dlog.c
@@ -32,8 +32,7 @@ void from_base(int *D, digit_t *r, int Dlen, int base)
             digit[0] = (digit_t)(-D[i]);
             if ((base & 1) == 0) {
                 Montgomery_neg(digit, (digit_t*)Alice_order);
-            }
-            else {  
+            } else {  
                 mp_sub((digit_t*)Bob_order, digit, digit, NWORDS_ORDER);                            
             }
         } else {
@@ -70,8 +69,7 @@ void from_base(int *D, digit_t *r, int Dlen, int base)
         digit[0] = (digit_t)(-D[0]);
         if ((base & 1) == 0) {
             Montgomery_neg(digit, (digit_t*)Alice_order);
-        }
-        else { 
+        } else { 
             mp_sub((digit_t*)Bob_order, digit, digit, NWORDS_ORDER);            
         }
     } else {
@@ -88,7 +86,6 @@ void from_base(int *D, digit_t *r, int Dlen, int base)
 
 #ifdef COMPRESSED_TABLES
 
-
 #ifdef ELL2_TORUS
 
 int ord2w_dlog(const felm_t *r, const int *logT, const felm_t *Texp)
@@ -97,11 +94,9 @@ int ord2w_dlog(const felm_t *r, const int *logT, const felm_t *Texp)
   // Output: corresponding digit d in [-2^{w1-1},2^{w1-1}]
     felm_t x, y;
     felm_t sum = {0}, prods[1<<(W_2_1-1)] = {0};
-    f2elm_t tmp;
 
     fpcopy(r[0], x);
     fpcopy(r[1], y);
-
     fpcorrection(x);
     fpcorrection(y);
 
@@ -112,11 +107,10 @@ int ord2w_dlog(const felm_t *r, const int *logT, const felm_t *Texp)
     fpneg(sum);
     fpcorrection(sum);
     if (memcmp(x, sum, NBITS_TO_NBYTES(NBITS_FIELD)) == 0) return logT[2];    
-    for (int j = 2; j < W_2; ++j)
-    {
-        for (int i = 0; i < (1<<(j-1)); ++i)
-        {
-            if ((i % 2) == 0) fpmul_mont(y, Texp[(1<<(j-2)) + (i/2) - 1], prods[(1<<(j-2)) + (i/2) - 1]);
+    for (int j = 2; j < W_2; ++j) {
+        for (int i = 0; i < (1<<(j-1)); ++i) {
+            if ((i % 2) == 0) 
+                fpmul_mont(y, Texp[(1<<(j-2)) + (i/2) - 1], prods[(1<<(j-2)) + (i/2) - 1]);
             fpcopy(y, sum);
             for (int k = 0; k <= j-2; ++k) {
                 if (((i>>(j-k-2)) % 2) == 0) 
@@ -142,9 +136,9 @@ int ord2w_dlog(const felm_t *r, const int *logT, const felm_t *Texp)
 // Output: The signed digit D in {-ell^(w-1), ..., ell^(w-1)}
 int ord2w_dloghyb(const felm_t *h, const int *logT, const felm_t *Texp, const felm_t *G)
 {
-    int k = 0, d = 0, index = 0, ord = 0, tmp = 0, w = W_2, w2 = w - W_2_1, i_j = 0, t, pow0, pow1;
+    int k = 0, d = 0, index = 0, ord = 0, tmp = 0, w = W_2, w2 = w - W_2_1, i_j = 0, t;
     uint8_t inv = 0, flag = 0;
-    f2elm_t H[W_2_1] = {0}, tmp2; // Size of H should be max of {W_2_1, W_2 - W_2_1}
+    f2elm_t H[W_2_1] = {0}; // Size of H should be max of {W_2_1, W_2 - W_2_1}
     felm_t one = {0};
 
     fpcopy((digit_t*)&Montgomery_one, one);    
@@ -152,8 +146,7 @@ int ord2w_dloghyb(const felm_t *h, const int *logT, const felm_t *Texp, const fe
     fpcorrection(H[0][0]);
     fpcorrection(H[0][1]);
 
-    for (int i = 1; i <= w2; ++i)
-    {
+    for (int i = 1; i <= w2; ++i) {
         if (!is_felm_zero(H[0][1])) { // check if first compressed Fp2 element in H is NOT the identity
             for (int j = k; j >= 0; j--) fp2copy(H[j], H[j+1]);
             sqr_Fp2_cycl_proj(H[0]);
diff --git a/src/ec_isogeny.c b/src/ec_isogeny.c
index 42ecd66..edd449e 100644
--- a/src/ec_isogeny.c
+++ b/src/ec_isogeny.c
@@ -273,26 +273,25 @@ void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t XPQ, const f2elm_t ZP
   // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. 
     f2elm_t t0, t1, t2;
 
-    fp2add(P->X, P->Z, t0);                         // t0 = XP+ZP
-    fp2sub(P->X, P->Z, t1);                         // t1 = XP-ZP    
-    fp2sqr_mont(t0, P->X);                          // XP = (XP+ZP)^2    
-    fp2sub(Q->X, Q->Z, t2);                         // t2 = XQ-ZQ
-    fp2correction(t2);    
-    fp2add(Q->X, Q->Z, Q->X);                       // XQ = XQ+ZQ    
-    fp2mul_mont(t0, t2, t0);                        // t0 = (XP+ZP)*(XQ-ZQ)    
-    fp2sqr_mont(t1, P->Z);                          // ZP = (XP-ZP)^2    
-    fp2mul_mont(t1, Q->X, t1);                      // t1 = (XP-ZP)*(XQ+ZQ)    
-    fp2sub(P->X, P->Z, t2);                         // t2 = (XP+ZP)^2-(XP-ZP)^2    
-    fp2mul_mont(P->X, P->Z, P->X);                  // XP = (XP+ZP)^2*(XP-ZP)^2    
-    fp2mul_mont(t2, A24, Q->X);                     // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]    
-    fp2sub(t0, t1, Q->Z);                           // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)    
-    fp2add(Q->X, P->Z, P->Z);                       // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2    
-    fp2add(t0, t1, Q->X);                           // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)    
-    fp2mul_mont(P->Z, t2, P->Z);                    // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]    
-    fp2sqr_mont(Q->Z, Q->Z);                        // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2    
-    fp2sqr_mont(Q->X, Q->X);                        // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2    
-    fp2mul_mont(Q->X, ZPQ, Q->X);                   // XQ = ZPQ*[(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2    
-    fp2mul_mont(Q->Z, XPQ, Q->Z);                   // ZQ = XPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2          
+    mp2_add(P->X, P->Z, t0);                        // t0 = XP+ZP
+    mp2_sub_p2(P->X, P->Z, t1);                     // t1 = XP-ZP
+    fp2sqr_mont(t0, P->X);                          // XP = (XP+ZP)^2
+    mp2_sub_p2(Q->X, Q->Z, t2);                     // t2 = XQ-ZQ
+    mp2_add(Q->X, Q->Z, Q->X);                      // XQ = XQ+ZQ
+    fp2mul_mont(t0, t2, t0);                        // t0 = (XP+ZP)*(XQ-ZQ)
+    fp2sqr_mont(t1, P->Z);                          // ZP = (XP-ZP)^2
+    fp2mul_mont(t1, Q->X, t1);                      // t1 = (XP-ZP)*(XQ+ZQ)
+    mp2_sub_p2(P->X, P->Z, t2);                     // t2 = (XP+ZP)^2-(XP-ZP)^2
+    fp2mul_mont(P->X, P->Z, P->X);                  // XP = (XP+ZP)^2*(XP-ZP)^2
+    fp2mul_mont(A24, t2, Q->X);                     // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
+    mp2_sub_p2(t0, t1, Q->Z);                       // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)
+    mp2_add(Q->X, P->Z, P->Z);                      // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
+    mp2_add(t0, t1, Q->X);                          // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)
+    fp2mul_mont(P->Z, t2, P->Z);                    // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
+    fp2sqr_mont(Q->Z, Q->Z);                        // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
+    fp2sqr_mont(Q->X, Q->X);                        // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
+    fp2mul_mont(Q->Z, XPQ, Q->Z);                   // ZQ = XPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
+    fp2mul_mont(Q->X, ZPQ, Q->X);                   // XQ = ZPQ*[(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2            
 }
 
 
diff --git a/src/fpx.c b/src/fpx.c
index 72629f8..09cbc34 100644
--- a/src/fpx.c
+++ b/src/fpx.c
@@ -1558,7 +1558,7 @@ int reverse_bits(int t, unsigned int nbits)
         x >>= 1;
         bits++;
     }
-    while (bits < nbits) {
+    while ((unsigned int)bits < nbits) {
         r <<= 1;
         bits++;
     }