diff --git a/Makefile b/Makefile
index 5d2a54b..833ea1a 100755
--- a/Makefile
+++ b/Makefile
@@ -77,7 +77,7 @@ else
 CFLAGS= $(EXTRA_CFLAGS)
 endif
 CFLAGS+= $(VALGRIND_CFLAGS)
-CFLAGS+= -std=gnu11 -Wall $(ADDITIONAL_SETTINGS) -D $(ARCHITECTURE) -D __NIX__ -D $(USE_OPT_LEVEL) $(MULX) $(ADX)
+CFLAGS+= -std=gnu11 -Wall $(ADDITIONAL_SETTINGS) -D $(ARCHITECTURE) -D __NIX__ -D $(USE_OPT_LEVEL) $(MULX) $(ADX) -Wno-missing-braces
 LDFLAGS=-lm
 ifeq "$(USE_OPT_LEVEL)" "_GENERIC_"
     EXTRA_OBJECTS_434=objs434/fp_generic.o
@@ -106,7 +106,7 @@ OBJECTS_503_COMP=objs503comp/P503_compressed.o $(EXTRA_OBJECTS_503) objs/random.
 OBJECTS_610_COMP=objs610comp/P610_compressed.o $(EXTRA_OBJECTS_610) objs/random.o objs/fips202.o
 OBJECTS_751_COMP=objs751comp/P751_compressed.o $(EXTRA_OBJECTS_751) objs/random.o objs/fips202.o
 
-all: lib434 lib503 lib610 lib751 lib434comp lib503comp lib610comp lib751comp tests KATS
+all: lib434 lib503 lib610 lib751 lib434comp lib503comp lib610comp lib751comp tests_p434 tests_p503 tests_p610 tests_p751 # every library plus the per-parameter-set test targets
 
 objs434/%.o: src/P434/%.c
 	@mkdir -p $(@D)
@@ -259,28 +259,6 @@ lib751comp: $(OBJECTS_751_COMP)
 	$(AR) lib751comp/libsidh.a $^
 	$(RANLIB) lib751comp/libsidh.a
 
-tests: lib434 lib434comp lib503 lib503comp lib610 lib610comp lib751 lib751comp
-	$(CC) $(CFLAGS) -L./lib434 tests/arith_tests-p434.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p434 $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p503 $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib610 tests/arith_tests-p610.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p610 $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p751 $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib434 tests/test_SIDHp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib503 tests/test_SIDHp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib610 tests/test_SIDHp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751 tests/test_SIDHp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib434 tests/test_SIKEp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib503 tests/test_SIKEp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib610 tests/test_SIKEp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751 tests/test_SIKEp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib434comp tests/test_SIDHp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434_compressed/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib503comp tests/test_SIDHp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503_compressed/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib610comp tests/test_SIDHp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610_compressed/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751comp tests/test_SIDHp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751_compressed/test_SIDH $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib434comp tests/test_SIKEp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434_compressed/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib503comp tests/test_SIKEp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503_compressed/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib610comp tests/test_SIKEp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610_compressed/test_SIKE $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751comp tests/test_SIKEp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751_compressed/test_SIKE $(ARM_SETTING)
-
 # AES
 AES_OBJS=objs/aes.o objs/aes_c.o
 
@@ -320,17 +298,40 @@ lib751comp_for_KATs: $(OBJECTS_751_COMP) $(AES_OBJS)
 	$(AR) lib751comp/libsidh_for_testing.a $^
 	$(RANLIB) lib751comp/libsidh_for_testing.a
 
-KATS: lib434_for_KATs lib503_for_KATs lib610_for_KATs lib751_for_KATs lib434comp_for_KATs lib503comp_for_KATs lib610comp_for_KATs lib751comp_for_KATs
+tests_p434: lib434 lib434comp lib434_for_KATs lib434comp_for_KATs # p434 only: arithmetic, SIDH/SIKE (plain and compressed) tests and KAT executables
+	$(CC) $(CFLAGS) -L./lib434 tests/arith_tests-p434.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p434 $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib434 tests/test_SIDHp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib434 tests/test_SIKEp434.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib434comp tests/test_SIDHp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh434_compressed/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib434comp tests/test_SIKEp434_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike434_compressed/test_SIKE $(ARM_SETTING)
 	$(CC) $(CFLAGS) -L./lib434 tests/PQCtestKAT_kem434.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike434/PQCtestKAT_kem $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib503 tests/PQCtestKAT_kem503.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503/PQCtestKAT_kem $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib610 tests/PQCtestKAT_kem610.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike610/PQCtestKAT_kem $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751 tests/PQCtestKAT_kem751.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751/PQCtestKAT_kem $(ARM_SETTING)
 	$(CC) $(CFLAGS) -L./lib434comp tests/PQCtestKAT_kem434_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike434_compressed/PQCtestKAT_kem $(ARM_SETTING)
+tests_p503: lib503 lib503comp lib503_for_KATs lib503comp_for_KATs # p503 only: arithmetic, SIDH/SIKE (plain and compressed) tests and KAT executables
+	$(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p503 $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503 tests/test_SIDHp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503 tests/test_SIKEp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503comp tests/test_SIDHp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503_compressed/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503comp tests/test_SIKEp503_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503_compressed/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503 tests/PQCtestKAT_kem503.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503/PQCtestKAT_kem $(ARM_SETTING)
 	$(CC) $(CFLAGS) -L./lib503comp tests/PQCtestKAT_kem503_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503_compressed/PQCtestKAT_kem $(ARM_SETTING)
+tests_p610: lib610 lib610comp lib610_for_KATs lib610comp_for_KATs # p610 only: arithmetic, SIDH/SIKE (plain and compressed) tests and KAT executables
+	$(CC) $(CFLAGS) -L./lib610 tests/arith_tests-p610.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p610 $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib610 tests/test_SIDHp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib610 tests/test_SIKEp610.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib610comp tests/test_SIDHp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh610_compressed/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib610comp tests/test_SIKEp610_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike610_compressed/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib610 tests/PQCtestKAT_kem610.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike610/PQCtestKAT_kem $(ARM_SETTING)
 	$(CC) $(CFLAGS) -L./lib610comp tests/PQCtestKAT_kem610_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike610_compressed/PQCtestKAT_kem $(ARM_SETTING)
-	$(CC) $(CFLAGS) -L./lib751comp tests/PQCtestKAT_kem751_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751_compressed/PQCtestKAT_kem $(ARM_SETTING)
+tests_p751: lib751 lib751comp lib751_for_KATs lib751comp_for_KATs # p751 only: arithmetic, SIDH/SIKE (plain and compressed) tests and KAT executables
+	$(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p751 $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/test_SIDHp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/test_SIKEp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751comp tests/test_SIDHp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751_compressed/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751comp tests/test_SIKEp751_compressed.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751_compressed/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/PQCtestKAT_kem751.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751/PQCtestKAT_kem $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751comp tests/PQCtestKAT_kem751_compressed.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751_compressed/PQCtestKAT_kem $(ARM_SETTING)
 
-check: tests
+check: tests_p434 tests_p503 tests_p610 tests_p751 # no recipe of its own: only ensures the test executables are built
 
 test434:
 ifeq "$(DO_VALGRIND_CHECK)" "TRUE"
diff --git a/README.md b/README.md
index be4dfa0..9dc86a8 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
-# SIDH v3.4 (C Edition)
+# SIDH v3.5 (C Edition)
 
 The **SIDH** library is an efficient supersingular isogeny-based cryptography library written in C language.
-**Version v3.4** of the library includes the ephemeral Diffie-Hellman key exchange scheme "SIDH" [1,2], and the CCA-secure
+**Version v3.5** of the library includes the ephemeral Diffie-Hellman key exchange scheme "SIDH" [1,2], and the CCA-secure
 key encapsulation mechanism "SIKE" [3]. These schemes are conjectured to be secure against quantum computer attacks.
 
 Concretely, the SIDH library includes the following KEM schemes:
@@ -91,9 +91,13 @@ The library was developed by [Microsoft Research](http://research.microsoft.com/
  
 - Memory optimizations for compressed SIDH and compressed SIKE.
 
+## New in Version 3.5
+ 
+- New implementations of the quadratic extension field arithmetic for x64 processors on Linux [13].
+
 ## Supported Platforms
 
-**SIDH v3.4** is supported on a wide range of platforms including x64, x86, ARM and s390x processors running Windows,
+**SIDH v3.5** is supported on a wide range of platforms including x64, x86, ARM and s390x processors running Windows,
 Linux or Mac OS X. We have tested the library with Microsoft Visual Studio 2015, GNU GCC v5.4, and clang v3.8.
 See instructions below to choose an implementation option and compile on one of the supported platforms.
 
@@ -121,7 +125,7 @@ optimizations using MULX/ADX.
 Other options for x64:
 
 ```sh
-$ make ARCH=x64 CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC] USE_MULX=[TRUE/FALSE] USE_ADX=[TRUE/FALSE]
+$ make tests_pXXX ARCH=x64 CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC] USE_MULX=[TRUE/FALSE] USE_ADX=[TRUE/FALSE]
 ```
 
 When `OPT_LEVEL=FAST` (i.e., assembly use enabled), the user is responsible for setting the flags MULX and ADX 
@@ -129,17 +133,18 @@ according to the targeted platform (for example, MULX/ADX are not supported on S
 is supported on Haswell, and both MULX and ADX are supported on Broadwell, Skylake and Kaby Lake architectures). 
 Note that USE_ADX can only be set to `TRUE` if `USE_MULX=TRUE`.
 The option `USE_MULX=FALSE` with `USE_ADX=FALSE` is only supported on p503 and p751.
+The use of `tests_pXXX`, for any value XXX in [434,503,610,751], allows compiling only one parameter set at a time.
 
 Options for x86/ARM/M1/s390x:
 
 ```sh
-$ make ARCH=[x86/ARM/M1/s390x] CC=[gcc/clang]
+$ make tests_pXXX ARCH=[x86/ARM/M1/s390x] CC=[gcc/clang]
 ```
 
 Options for ARM64 or Apple M1:
 
 ```sh
-$ make ARCH=[ARM64/M1] CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC]
+$ make tests_pXXX ARCH=[ARM64/M1] CC=[gcc/clang] OPT_LEVEL=[FAST/GENERIC]
 ```
 
 As in the x64 case, `OPT_LEVEL=FAST` enables the use of assembly optimizations on ARMv8 platforms.
@@ -209,7 +214,7 @@ The library includes some third party modules that are licensed differently. In
 - `tests/PQCtestKAT_kem<#>.c`: copyrighted by Lawrence E. Bassham 
 - `src/sha3/fips202.c`: public domain
 
-## Contributors
+## Other contributors
 
 - Basil Hess.
 - Geovandro Pereira.
@@ -223,8 +228,8 @@ The extended version is available [`here`](http://eprint.iacr.org/2016/413).
 [2]  David Jao and Luca DeFeo, "Towards quantum-resistant cryptosystems from supersingular elliptic curve isogenies". PQCrypto 2011, LNCS 7071, pp. 19-34, 2011. 
 The extended version is available [`here`](https://eprint.iacr.org/2011/506).
 
-[3]  Reza Azarderakhsh, Matthew Campagna, Craig Costello, Luca De Feo, Basil Hess, Amir Jalali, David Jao, Brian Koziel, Brian LaMacchia, Patrick Longa, Michael Naehrig, Joost Renes, Vladimir Soukharev, and David Urbanik, "Supersingular Isogeny Key Encapsulation". Submission to the NIST Post-Quantum Standardization project, 2017.  
-The round 2 submission package is available [`here`](https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-2/submissions/SIKE-Round2.zip).
+[3]  Reza Azarderakhsh, Matthew Campagna, Craig Costello, Luca De Feo, Basil Hess, Aaron Hutchinson, Amir Jalali, Koray Karabina, David Jao, Brian Koziel, Brian LaMacchia, Patrick Longa, Michael Naehrig, Geovandro Pereira, Joost Renes, Vladimir Soukharev, and David Urbanik, "Supersingular Isogeny Key Encapsulation (SIKE)", 2017.  
+The specifications document is available [`here`](https://sike.org).
 
 [4]  Craig Costello, and Huseyin Hisil, "A simple and compact algorithm for SIDH with arbitrary degree isogenies". Advances in Cryptology - ASIACRYPT 2017, LNCS 10625, pp. 303-329, 2017. 
 The preprint version is available [`here`](https://eprint.iacr.org/2017/504). 
@@ -235,10 +240,10 @@ The preprint version is available [`here`](https://eprint.iacr.org/2017/1015).
 [6]  Gora Adj, Daniel Cervantes-Vázquez, Jesús-Javier Chi-Domínguez, Alfred Menezes and Francisco Rodríguez-Henríquez, "On the cost of computing isogenies between supersingular elliptic curves". SAC 2018, LCNS 11349, pp. 322-343, 2018. 
 The preprint version is available [`here`](https://eprint.iacr.org/2018/313). 
 
-[7]  Samuel Jaques and John M. Schanck, "Quantum cryptanalysis in the RAM model: Claw-finding attacks on SIKE". Advances in Cryptology - CRYPTO 2019 (to appear), 2019. 
+[7]  Samuel Jaques and John M. Schanck, "Quantum cryptanalysis in the RAM model: Claw-finding attacks on SIKE". Advances in Cryptology - CRYPTO 2019, 2019. 
 The preprint version is available [`here`](https://eprint.iacr.org/2019/103).   
 
-[8]  Craig Costello, Patrick Longa, Michael Naehrig, Joost Renes and Fernando Virdia, "Improved Classical Cryptanalysis of the Computational Supersingular Isogeny Problem", 2019. 
+[8]  Craig Costello, Patrick Longa, Michael Naehrig, Joost Renes and Fernando Virdia, "Improved classical cryptanalysis of the computational supersingular isogeny problem". PKC 2020, LNCS 12111, pp. 505-534, 2020. 
 The preprint version is available [`here`](https://eprint.iacr.org/2019/298). 
 
 [9]  Craig Costello, David Jao, Patrick Longa, Michael Naehrig, Joost Renes and David Urbanik, "Efficient compression of SIDH public keys". Advances in Cryptology - EUROCRYPT 2017, LNCS 10210, pp. 679-706, 2017. 
@@ -247,12 +252,14 @@ The preprint version is available [`here`](https://eprint.iacr.org/2016/963).
 [10]  Gustavo H.M. Zanon, Marcos A. Simplicio Jr, Geovandro C.C.F. Pereira, Javad Doliskani and Paulo S.L.M. Barreto, "Faster key compression for isogeny-based cryptosystems". IEEE Transactions on Computers, Vol. 68(5), 2019. 
 The preprint version is available [`here`](https://eprint.iacr.org/2017/1143).  
 
-[11]  Michael Naehrig and Joost Renes, "Dual Isogenies and Their Application to Public-key Compression for Isogeny-based Cryptography". Advances in Cryptology - ASIACRYPT 2019, LNCS 11922, pp. 243-272, 2019.
+[11]  Michael Naehrig and Joost Renes, "Dual isogenies and their application to public-key compression for isogeny-based cryptography". Advances in Cryptology - ASIACRYPT 2019, LNCS 11922, pp. 243-272, 2019.
 The preprint version is available [`here`](https://eprint.iacr.org/2019/499).
 
-[12]  Geovandro C.C.F. Pereira, Javad Doliskani and David Jao, "x-only point addition formula and faster torsion basis generation in compressed SIKE".
+[12]  Geovandro C.C.F. Pereira, Javad Doliskani and David Jao, "x-only point addition formula and faster torsion basis generation in compressed SIKE". JCEN, Vol. 11, pp. 57-69, 2021. 
 The preprint version is available [`here`](https://eprint.iacr.org/2020/431).
 
+[13]  Patrick Longa, "Efficient algorithms for large prime characteristic fields and their application to bilinear pairings and supersingular isogeny-based protocols", 2022.
+
 # Contributing
 
 This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
diff --git a/src/P434/AMD64/fp_x64.c b/src/P434/AMD64/fp_x64.c
index 5cb92a7..16852fb 100644
--- a/src/P434/AMD64/fp_x64.c
+++ b/src/P434/AMD64/fp_x64.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license 
 *
 * Abstract: modular arithmetic optimized for x64 platforms for P434
 *********************************************************************************************/
@@ -17,7 +21,7 @@ extern const uint64_t p434x4[NWORDS_FIELD];
 
 inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -39,7 +43,7 @@ inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c)
 
 inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
+#if (OS_TARGET == OS_WIN) // NOTE(review): no other branch appears to follow before #endif, so on non-Windows builds this function seems to do nothing (c is left unwritten) - confirm it has no remaining callers there.
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -50,11 +54,6 @@ inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c)
     for (i = 0; i < NWORDS_FIELD; i++) {
         ADDC(borrow, c[i], ((digit_t*)p434x4)[i], borrow, c[i]); 
     }
-    
-#elif (OS_TARGET == OS_NIX)                 
-    
-    mp_sub434_p4_asm(a, b, c);    
-
 #endif
 }
 
@@ -161,13 +160,42 @@ void fpcorrection434(digit_t* a)
     }
 }
 
+#if (OS_TARGET == OS_NIX)
+
+void fp2mul434_c0_mont(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, b, c unchanged to the assembly routine fp2mul434_c0_asm.
+    fp2mul434_c0_asm(a, b, c);
+}
+
+
+void fp2mul434_c1_mont(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, b, c unchanged to the assembly routine fp2mul434_c1_asm.
+    fp2mul434_c1_asm(a, b, c);
+}
+
+
+void fp2sqr434_c0_mont(const digit_t* a, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, c unchanged to the assembly routine fp2sqr434_c0_asm.
+    fp2sqr434_c0_asm(a, c);
+}
+
+
+void fp2sqr434_c1_mont(const digit_t* a, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, c unchanged to the assembly routine fp2sqr434_c1_asm.
+    fp2sqr434_c1_asm(a, c);
+}
+
+void fpmul434(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, b, c unchanged to the assembly routine fpmul434_asm.
+    fpmul434_asm(a, b, c);
+}
+
+#else
 
 void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
 { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
         
     UNREFERENCED_PARAMETER(nwords);
-
-#if (OS_TARGET == OS_WIN)
     digit_t t = 0;
     uint128_t uv = {0};
     unsigned int carry = 0;
@@ -330,12 +358,6 @@ void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int n
     MULADD128(a[6], b[6], uv, carry, uv);
     c[12] = uv[0];
     c[13] = uv[1];
-
-#elif (OS_TARGET == OS_NIX)
-    
-    mul434_asm(a, b, c);
-
-#endif
 }
 
 
@@ -343,9 +365,7 @@ void rdc_mont(digit_t* ma, digit_t* mc)
 { // Montgomery reduction exploiting special form of the prime.
   // mc = ma*R^-1 mod p434x2, where R = 2^448.
   // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
-  // ma is assumed to be in Montgomery representation.
-        
-#if (OS_TARGET == OS_WIN)
+  // ma is assumed to be in Montgomery representation.        
     unsigned int carry;
     digit_t t = 0;
     uint128_t uv = {0};
@@ -478,11 +498,7 @@ void rdc_mont(digit_t* ma, digit_t* mc)
     MULADD128(mc[6], ((digit_t*)p434p1)[6], uv, carry, uv);
     t += carry;
     ADDC(0, uv[0], ma[12], carry, mc[5]); 
-    ADDC(carry, uv[1], ma[13], carry, mc[6]); 
-    
-#elif (OS_TARGET == OS_NIX)                 
-    
-    rdc434_asm(ma, mc);    
+    ADDC(carry, uv[1], ma[13], carry, mc[6]);
+}
 
-#endif
-}
\ No newline at end of file
+#endif // (OS_TARGET == OS_NIX)
diff --git a/src/P434/AMD64/fp_x64_asm.S b/src/P434/AMD64/fp_x64_asm.S
index 5cea37c..86be311 100644
--- a/src/P434/AMD64/fp_x64_asm.S
+++ b/src/P434/AMD64/fp_x64_asm.S
@@ -1,1020 +1,813 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license 
 //
 // Abstract: field arithmetic in x64 assembly for P434 on Linux
-//*******************************************************************************************  
-
-.intel_syntax noprefix 
-
-// Format function and variable names for Mac OS X
-#if defined(__APPLE__)
-    #define fmt(f)    _##f
-#else
-    #define fmt(f)    f
-#endif
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-
-// Define addition instructions
-#ifdef _MULX_
-#ifdef _ADX_
-
-#define ADD1    adox
-#define ADC1    adox
-#define ADD2    adcx
-#define ADC2    adcx
-
-#else
-
-#define ADD1    add
-#define ADC1    adc
-#define ADD2    add
-#define ADC2    adc
-
-#endif    
-#endif
-
-
-.text
-//***********************************************************************
-//  Field addition
-//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//*********************************************************************** 
-.global fmt(fpadd434_asm)
-fmt(fpadd434_asm):
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  push   rbx
-  push   rbp
-  
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  add    r8, [reg_p2] 
-  adc    r9, [reg_p2+8] 
-  adc    r10, [reg_p2+16] 
-  adc    r11, [reg_p2+24] 
-  adc    r12, [reg_p2+32] 
-  adc    r13, [reg_p2+40] 
-  adc    r14, [reg_p2+48]
-
-  mov    rbx, [rip+fmt(p434x2)]
-  sub    r8, rbx
-  mov    rcx, [rip+fmt(p434x2)+8]
-  sbb    r9, rcx
-  sbb    r10, rcx
-  mov    rdi, [rip+fmt(p434x2)+24]
-  sbb    r11, rdi
-  mov    rsi, [rip+fmt(p434x2)+32]
-  sbb    r12, rsi
-  mov    rbp, [rip+fmt(p434x2)+40]
-  sbb    r13, rbp
-  mov    r15, [rip+fmt(p434x2)+48]
-  sbb    r14, r15
-  sbb    rax, 0
-  
-  and    rbx, rax
-  and    rcx, rax
-  and    rdi, rax
-  and    rsi, rax
-  and    rbp, rax
-  and    r15, rax
-  
-  add    r8, rbx  
-  adc    r9, rcx  
-  adc    r10, rcx  
-  adc    r11, rdi 
-  adc    r12, rsi 
-  adc    r13, rbp   
-  adc    r14, r15
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13 
-  mov    [reg_p3+48], r14
-  
-  pop    rbp
-  pop    rbx
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
-//***********************************************************************
-//  Field subtraction
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
-//*********************************************************************** 
-.global fmt(fpsub434_asm)
-fmt(fpsub434_asm):
-  push   r12
-  push   r13
-  push   r14
-  
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    r14, [reg_p2+48]
-  sbb    rax, 0
-  
-  mov    rcx, [rip+fmt(p434x2)]
-  mov    rdi, [rip+fmt(p434x2)+8]
-  mov    rsi, [rip+fmt(p434x2)+24]
-  and    rcx, rax
-  and    rdi, rax
-  and    rsi, rax  
-  add    r8, rcx  
-  adc    r9, rdi  
-  adc    r10, rdi  
-  adc    r11, rsi 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11 
-  setc   cl  
-
-  mov    r8, [rip+fmt(p434x2)+32]
-  mov    rdi, [rip+fmt(p434x2)+40]
-  mov    rsi, [rip+fmt(p434x2)+48]
-  and    r8, rax
-  and    rdi, rax
-  and    rsi, rax  
-  bt     rcx, 0  
-  adc    r12, r8 
-  adc    r13, rdi   
-  adc    r14, rsi
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], r14
-  
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
-///////////////////////////////////////////////////////////////// MACRO
-.macro SUB434_PX  P0
-  push   r12
-  push   r13
-  
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    rcx, [reg_p1+48]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    rcx, [reg_p2+48]
-
-  mov    rax, [rip+\P0]
-  mov    rdi, [rip+\P0+8]
-  mov    rsi, [rip+\P0+24]
-  add    r8, rax
-  mov    rax, [rip+\P0+32]  
-  adc    r9, rdi  
-  adc    r10, rdi 
-  adc    r11, rsi 
-  mov    rdi, [rip+\P0+40]
-  mov    rsi, [rip+\P0+48]
-  adc    r12, rax   
-  adc    r13, rdi  
-  adc    rcx, rsi
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], rcx
-  
-  pop    r13
-  pop    r12
-  .endm
-
-
-//***********************************************************************
-//  Multiprecision subtraction with correction with 2*p434
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434
-//*********************************************************************** 
-.global fmt(mp_sub434_p2_asm)
-fmt(mp_sub434_p2_asm):
-
-  SUB434_PX  fmt(p434x2)
-  ret
-
-
-//***********************************************************************
-//  Multiprecision subtraction with correction with 4*p434
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p434
-//*********************************************************************** 
-.global fmt(mp_sub434_p4_asm)
-fmt(mp_sub434_p4_asm):
-
-  SUB434_PX  fmt(p434x4)
-  ret
-
-
-#ifdef _MULX_
-    
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  memory pointers M0 and M1
-// Outputs: memory pointer C and regs T1, T3, rax
-// Temps:   regs T0:T6
-/////////////////////////////////////////////////////////////////
-
-#ifdef _ADX_
-.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
-    mov    \C, \T1           // C0_final
-    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
-    xor    rax, rax   
-    adox   \T0, \T2        
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
-    adox   \T1, \T3
-           
-    mov    rdx, 8\M0
-    mulx   \T3, \T4, \M1     // T3:T4 = A1*B0
-    adox   \T2, rax 
-    xor    rax, rax   
-    mulx   \T5, \T6, 8\M1    // T5:T6 = A1*B1
-    adox   \T4, \T0
-    mov    8\C, \T4          // C1_final  
-    adcx   \T3, \T6      
-    mulx   \T6, \T0, 16\M1   // T6:T0 = A1*B2 
-    adox   \T3, \T1  
-    adcx   \T5, \T0     
-    adcx   \T6, rax 
-    adox   \T5, \T2	
-    
-    mov    rdx, 16\M0
-    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
-    adox   \T6, rax
-    xor    rax, rax 
-    mulx   \T4, \T2, 8\M1    // T4:T2 = A2*B1
-    adox   \T0, \T3   
-    mov    16\C, \T0         // C2_final 
-    adcx   \T1, \T5    
-    mulx   \T0, \T3, 16\M1   // T0:T3 = A2*B2
-    adcx   \T4, \T6  
-    adcx   \T0, rax
-    adox   \T1, \T2
-    adox   \T3, \T4
-    adox   rax, \T0
-.endm 
-    
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  memory pointers M0 and M1
-// Outputs: memory pointer C
-// Temps:   regs T0:T9
-/////////////////////////////////////////////////////////////////
-
-.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
-    mov    \C, \T1           // C0_final
-    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
-    xor    rax, rax   
-    adox   \T0, \T2        
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
-    adox   \T1, \T3        
-    mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
-    adox   \T2, \T4 
-           
-    mov    rdx, 8\M0
-    mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
-    adox   \T3, rax 
-    xor    rax, rax   
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
-    adox   \T4, \T0
-    mov    8\C, \T4          // C1_final  
-    adcx   \T5, \T7      
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
-    adcx   \T6, \T8  
-    adox   \T5, \T1      
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
-    adcx   \T7, \T9        
-    adcx   \T8, rax   
-    adox   \T6, \T2
-    
-    mov    rdx, 16\M0
-    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
-    adox   \T7, \T3
-    adox   \T8, rax
-    xor    rax, rax 
-    mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
-    adox   \T0, \T5   
-    mov    16\C, \T0         // C2_final 
-    adcx   \T1, \T3    
-    mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
-    adcx   \T2, \T4 
-    adox   \T1, \T6       
-    mulx   \T4,\T9, 24\M1    // T3:T4 = A2*B3
-    adcx   \T3, \T9        
-    mov    rdx, 24\M0
-    adcx   \T4, rax         
-
-    adox   \T2, \T7
-    adox   \T3, \T8
-    adox   \T4, rax
-
-    mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
-    xor    rax, rax 
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
-    adcx   \T5, \T7 
-    adox   \T1, \T0       
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
-    adcx   \T6, \T8  
-    adox   \T2, \T5      
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
-    adcx   \T7, \T9        
-    adcx   \T8, rax         
-
-    adox   \T3, \T6
-    adox   \T4, \T7
-    adox   \T8, rax
-    mov    24\C, \T1         // C3_final
-    mov    32\C, \T2         // C4_final
-    mov    40\C, \T3         // C5_final
-    mov    48\C, \T4         // C6_final
-    mov    56\C, \T8         // C7_final
-.endm 
-
-#else
-
-.macro MUL192_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
-    mov    \C, \T1           // C0_final
-    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
-    add    \T0, \T2        
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
-    adc    \T1, \T3
-           
-    mov    rdx, 8\M0
-    mulx   \T3, \T4, \M1     // T3:T4 = A1*B0
-    adc    \T2, 0   
-    mulx   \T5, \T6, 8\M1    // T5:T6 = A1*B1
-    add    \T4, \T0
-    mov    8\C, \T4          // C1_final
-    adc    \T3, \T1  
-    adc    \T5, \T2	    
-    mulx   \T2, \T1, 16\M1   // T2:T1 = A1*B2
-    adc    \T2, 0    
-
-    add    \T3, \T6  
-    adc    \T5, \T1     
-    adc    \T2, 0
-    
-    mov    rdx, 16\M0
-    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
-    add    \T0, \T3   
-    mov    16\C, \T0         // C2_final 
-    mulx   \T4, \T6, 8\M1    // T4:T6 = A2*B1
-    adc    \T1, \T5    
-    adc    \T2, \T4 
-    mulx   rax, \T3, 16\M1   // rax:T3 = A2*B2 
-    adc    rax, 0
-    add    \T1, \T6
-    adc    \T3, \T2
-    adc    rax, 0
-.endm 
-
-.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
-    mov    \C, \T1           // C0_final
-    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
-    add    \T0, \T2        
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
-    adc    \T1, \T3         
-    mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
-    adc    \T2, \T4        
-    mov    rdx, 8\M0
-    adc    \T3, 0         
-
-    mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
-    add    \T5, \T7        
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
-    adc    \T6, \T8        
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
-    adc    \T7, \T9        
-    adc    \T8, 0         
-
-    add    \T4, \T0
-    mov    8\C, \T4          // C1_final
-    adc    \T5, \T1
-    adc    \T6, \T2
-    adc    \T7, \T3
-    mov    rdx, 16\M0
-    adc    \T8, 0
-
-    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
-    mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
-    add    \T1, \T3        
-    mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
-    adc    \T2, \T4        
-    mulx   \T4,\T9, 24\M1    // T3:T4 = A2*B3
-    adc    \T3, \T9        
-    mov    rdx, 24\M0
-    adc    \T4, 0          
-
-    add    \T0, \T5
-    mov    16\C, \T0         // C2_final
-    adc    \T1, \T6
-    adc    \T2, \T7
-    adc    \T3, \T8
-    adc    \T4, 0
-
-    mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
-    add    \T5, \T7        
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
-    adc    \T6, \T8        
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
-    adc    \T7, \T9         
-    adc    \T8, 0         
-
-    add    \T1, \T0
-    mov    24\C, \T1         // C3_final
-    adc    \T2, \T5
-    mov    32\C, \T2         // C4_final
-    adc    \T3, \T6
-    mov    40\C, \T3         // C5_final
-    adc    \T4, \T7
-    mov    48\C, \T4         // C6_final
-    adc    \T8, 0
-    mov    56\C, \T8         // C7_final
-.endm
-#endif
-
-
-//*****************************************************************************
-//  434-bit multiplication using Karatsuba (one level), schoolbook (one level)
-//***************************************************************************** 
-.global fmt(mul434_asm)
-fmt(mul434_asm):    
-    push   r12
-    push   r13 
-    push   r14 
-    push   r15
-    mov    rcx, reg_p3 
-
-    // r8-r11 <- AH + AL, rax <- mask
-    xor    rax, rax
-    mov    r8, [reg_p1]
-    mov    r9, [reg_p1+8]
-    mov    r10, [reg_p1+16]
-    mov    r11, [reg_p1+24] 
-    push   rbx 
-    push   rbp
-    sub    rsp, 96
-    add    r8, [reg_p1+32]
-    adc    r9, [reg_p1+40]
-    adc    r10, [reg_p1+48]
-    adc    r11, 0
-    sbb    rax, 0
-    mov    [rsp], r8
-    mov    [rsp+8], r9
-    mov    [rsp+16], r10
-    mov    [rsp+24], r11
-
-    // r12-r15 <- BH + BL, rbx <- mask
-    xor    rbx, rbx
-    mov    r12, [reg_p2]
-    mov    r13, [reg_p2+8]
-    mov    r14, [reg_p2+16]
-    mov    r15, [reg_p2+24]
-    add    r12, [reg_p2+32]
-    adc    r13, [reg_p2+40]
-    adc    r14, [reg_p2+48]
-    adc    r15, 0
-    sbb    rbx, 0
-    mov    [rsp+32], r12
-    mov    [rsp+40], r13
-    mov    [rsp+48], r14
-    mov    [rsp+56], r15
-    
-    // r12-r15 <- masked (BH + BL)
-    and    r12, rax
-    and    r13, rax
-    and    r14, rax
-    and    r15, rax
-
-    // r8-r11 <- masked (AH + AL)
-    and    r8, rbx
-    and    r9, rbx
-    and    r10, rbx
-    and    r11, rbx
-
-    // r8-r11 <- masked (AH + AL) + masked (AH + AL)
-    add    r8, r12
-    adc    r9, r13
-    adc    r10, r14
-    adc    r11, r15
-    mov    [rsp+64], r8
-    mov    [rsp+72], r9
-    mov    [rsp+80], r10
-    mov    [rsp+88], r11
-
-    // [rsp] <- (AH+AL) x (BH+BL), low part 
-    MUL256_SCHOOL  [rsp], [rsp+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp 
-
-    // [rcx] <- AL x BL
-    MUL256_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp     // Result C0-C3
-
-    // [rcx+64], rbx, rbp, rax <- AH x BH 
-    MUL192_SCHOOL  [reg_p1+32], [reg_p2+32], [rcx+64], r8, rbx, r10, rbp, r12, r13, r14
-    
-    // r8-r11 <- (AH+AL) x (BH+BL), final step
-    mov    r8, [rsp+64]
-    mov    r9, [rsp+72]
-    mov    r10, [rsp+80]
-    mov    r11, [rsp+88]
-    mov    rdx, [rsp+32]
-    add    r8, rdx
-    mov    rdx, [rsp+40]
-    adc    r9, rdx
-    mov    rdx, [rsp+48]
-    adc    r10, rdx
-    mov    rdx, [rsp+56]
-    adc    r11, rdx
-    
-    // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL
-    mov    r12, [rsp]
-    mov    r13, [rsp+8]
-    mov    r14, [rsp+16]
-    mov    r15, [rsp+24]
-    sub    r12, [rcx]
-    sbb    r13, [rcx+8]
-    sbb    r14, [rcx+16]
-    sbb    r15, [rcx+24]
-    sbb    r8, [rcx+32]
-    sbb    r9, [rcx+40]
-    sbb    r10, [rcx+48]
-    sbb    r11, [rcx+56]
-    
-    // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
-    sub    r12, [rcx+64]
-    sbb    r13, [rcx+72]
-    sbb    r14, [rcx+80]
-    sbb    r15, rbx
-    sbb    r8, rbp
-    sbb    r9, rax
-    sbb    r10, 0
-    sbb    r11, 0
-    
-    add    r12, [rcx+32]
-    mov    [rcx+32], r12    // Result C4-C7
-    adc    r13, [rcx+40]
-    mov    [rcx+40], r13 
-    adc    r14, [rcx+48]
-    mov    [rcx+48], r14 
-    adc    r15, [rcx+56]
-    mov    [rcx+56], r15
-    adc    r8, [rcx+64] 
-    mov    [rcx+64], r8    // Result C8-C15
-    adc    r9, [rcx+72]
-    mov    [rcx+72], r9 
-    adc    r10, [rcx+80]
-    mov    [rcx+80], r10
-    adc    r11, rbx
-    mov    [rcx+88], r11
-    adc    rbp, 0
-    mov    [rcx+96], rbp 
-    adc    rax, 0
-    mov    [rcx+104], rax
-    
-    add    rsp, 96    
-    pop    rbp  
-    pop    rbx
-    pop    r15
-    pop    r14
-    pop    r13
-    pop    r12
-    ret
-
-#else
-
-# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"
-
-#endif
-
-
-#ifdef _MULX_
-
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  reg I0 and memory pointer M1
-// Outputs: regs T0:T4
-// Temps:   regs T0:T5
-/////////////////////////////////////////////////////////////////
-.macro MUL64x256_SCHOOL I0, M1, T0, T1, T2, T3, T4, T5 
-    mulx   \T2, \T4, 8\M1
-    xor    rax, rax
-    mulx   \T3, \T5, 16\M1 
-    ADD1   \T1, \T4            // T1 <- C1_final   
-    ADC1   \T2, \T5            // T2 <- C2_final 
-    mulx   \T4, \T5, 24\M1
-    ADC1   \T3, \T5            // T3 <- C3_final
-    ADC1   \T4, rax            // T4 <- C4_final
-.endm
-
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  regs I0 and I1, and memory pointer M1
-// Outputs: regs T0:T5
-// Temps:   regs T0:T5
-/////////////////////////////////////////////////////////////////
-
-#ifdef _ADX_
-.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
-    mulx   \T2, \T4, 8\M1
-    xor    rax, rax
-    mulx   \T3, \T5, 16\M1 
-    ADD1   \T1, \T4               
-    ADC1   \T2, \T5     
-    mulx   \T4, \T5, 24\M1
-    ADC1   \T3, \T5 
-    ADC1   \T4, rax   
-    
-    xor    rax, rax
-    mov    rdx, \I1 
-    mulx   \I1, \T5, \M1 
-    ADD2   \T1, \T5            // T1 <- C1_final 
-    ADC2   \T2, \I1     
-    mulx   \T5, \I1, 8\M1
-    ADC2   \T3, \T5 
-    ADD1   \T2, \I1        
-    mulx   \T5, \I1, 16\M1
-    ADC2   \T4, \T5 
-    ADC1   \T3, \I1     
-    mulx   \T5, \I1, 24\M1   
-    ADC2   \T5, rax         
-    ADC1   \T4, \I1  
-    ADC1   \T5, rax 
-.endm
-
-#else
-
-.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 
-    mulx   \T2, \T4, 8\M1
-    mulx   \T3, \T5, 16\M1 
-    add    \T1, \T4               
-    adc    \T2, \T5     
-    mulx   \T4, \T5, 24\M1
-    adc    \T3, \T5 
-    adc    \T4, 0   
-    
-    mov    rdx, \I1 
-    mulx   \I1, \T5, \M1 
-    add    \T1, \T5            // T1 <- C1_final 
-    adc    \T2, \I1     
-    mulx   \T5, \I1, 8\M1
-    adc    \T3, \T5       
-    mulx   \T5, rax, 16\M1
-    adc    \T4, \T5     
-    mulx   \T5, rdx, 24\M1 
-    adc    \T5, 0
-    add    \T2, \I1  
-    adc    \T3, rax        
-    adc    \T4, rdx  
-    adc    \T5, 0 
-.endm
-#endif
-
-  
-//**************************************************************************************
-//  Montgomery reduction
-//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
-//  Operation: c [reg_p2] = a [reg_p1]
-//************************************************************************************** 
-.global fmt(rdc434_asm)
-fmt(rdc434_asm):
-    push   r14
-
-    // a[0-1] x p434p1_nz --> result: r8:r13 
-    mov    rdx, [reg_p1]
-    mov    r14, [reg_p1+8]  
-    mulx   r9, r8, [rip+fmt(p434p1)+24]   // result r8    
-    push   r12
-    push   r13
-    push   r15
-    push   rbp
-    push   rbx 
-    MUL128x256_SCHOOL rdx, r14, [rip+fmt(p434p1)+24], r8, r9, r10, r11, r12, r13     
-
-    mov    rdx, [reg_p1+16]   
-    mov    rcx, [reg_p1+72]
-    add    r8, [reg_p1+24]  
-    adc    r9, [reg_p1+32]  
-    adc    r10, [reg_p1+40]   
-    adc    r11, [reg_p1+48]   
-    adc    r12, [reg_p1+56]   
-    adc    r13, [reg_p1+64] 
-    adc    rcx, 0 
-    mulx   rbp, rbx, [rip+fmt(p434p1)+24]   // result rbx
-    mov    [reg_p2], r9  
-    mov    [reg_p2+8], r10  
-    mov    [reg_p2+16], r11  
-    mov    [reg_p2+24], r12  
-    mov    [reg_p2+32], r13 
-    mov    r9, [reg_p1+80]  
-    mov    r10, [reg_p1+88]  
-    mov    r11, [reg_p1+96]
-    mov    rdi, [reg_p1+104]
-    adc    r9, 0
-    adc    r10, 0
-    adc    r11, 0
-    adc    rdi, 0
-
-    // a[2-3] x p434p1_nz --> result: rbx, rbp, r12:r15
-    MUL128x256_SCHOOL rdx, r8, [rip+fmt(p434p1)+24], rbx, rbp, r12, r13, r14, r15
-
-    mov    rdx, [reg_p2]
-    add    rbx, [reg_p2+8]  
-    adc    rbp, [reg_p2+16]  
-    adc    r12, [reg_p2+24]   
-    adc    r13, [reg_p2+32]  
-    adc    r14, rcx  
-    mov    rcx, 0 
-    adc    r15, r9
-    adc    rcx, r10
-    mulx   r9, r8, [rip+fmt(p434p1)+24]   // result r8
-    mov    [reg_p2], rbp 
-    mov    [reg_p2+8], r12  
-    mov    [reg_p2+16], r13 
-    adc    r11, 0
-    adc    rdi, 0 
-
-    // a[4-5] x p434p1_nz --> result: r8:r13
-    MUL128x256_SCHOOL rdx, rbx, [rip+fmt(p434p1)+24], r8, r9, r10, rbp, r12, r13  
-
-    mov    rdx, [reg_p2]
-    add    r8, [reg_p2+8]  
-    adc    r9, [reg_p2+16]  
-    adc    r10, r14   
-    adc    rbp, r15 
-    adc    r12, rcx 
-    adc    r13, r11   
-    adc    rdi, 0  
-    mulx   r15, r14, [rip+fmt(p434p1)+24]  // result r14 
-    mov    [reg_p2], r8        // Final result c0-c1
-    mov    [reg_p2+8], r9    
-
-    // a[6-7] x p434p1_nz --> result: r14:r15, r8:r9, r11
-    MUL64x256_SCHOOL rdx, [rip+fmt(p434p1)+24], r14, r15, r8, r9, r11, rcx  
-    
-    // Final result c2:c6
-    add    r14, r10  
-    adc    r15, rbp 
-    pop    rbx
-    pop    rbp 
-    adc    r8, r12   
-    adc    r9, r13  
-    adc    r11, rdi 
-    mov    [reg_p2+16], r14  
-    mov    [reg_p2+24], r15  
-    pop    r15
-    pop    r13
-    mov    [reg_p2+32], r8  
-    mov    [reg_p2+40], r9  
-    mov    [reg_p2+48], r11
-
-    pop    r12
-    pop    r14
-    ret
-
-  #else
-
-  # error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"
-
-  #endif
-
-
-//***********************************************************************
-//  434-bit multiprecision addition
-//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//*********************************************************************** 
-.global fmt(mp_add434_asm)
-fmt(mp_add434_asm): 
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  add    r8, [reg_p2] 
-  adc    r9, [reg_p2+8] 
-  adc    r10, [reg_p2+16] 
-  adc    r11, [reg_p2+24] 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  
-  mov    r8, [reg_p1+32]
-  mov    r9, [reg_p1+40]
-  mov    r10, [reg_p1+48]
-  adc    r8, [reg_p2+32] 
-  adc    r9, [reg_p2+40] 
-  adc    r10, [reg_p2+48] 
-  mov    [reg_p3+32], r8
-  mov    [reg_p3+40], r9
-  mov    [reg_p3+48], r10
-  ret
-
-
-//***************************************************************************
-//  2x434-bit multiprecision subtraction/addition
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. If c < 0, add p434*2^448
-//*************************************************************************** 
-.global fmt(mp_subadd434x2_asm)
-fmt(mp_subadd434x2_asm):
-  push   r12
-  push   r13 
-  push   r14 
-  push   r15 
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12
-
-  mov    r8, [reg_p1+40]
-  mov    r9, [reg_p1+48]
-  mov    r10, [reg_p1+56] 
-  mov    r11, [reg_p1+64]
-  mov    r12, [reg_p1+72] 
-  sbb    r8, [reg_p2+40] 
-  sbb    r9, [reg_p2+48] 
-  sbb    r10, [reg_p2+56]
-  sbb    r11, [reg_p2+64] 
-  sbb    r12, [reg_p2+72]
-  mov    [reg_p3+40], r8
-  mov    [reg_p3+48], r9
-  mov    [reg_p3+56], r10
-  
-  mov    r13, [reg_p1+80]
-  mov    r14, [reg_p1+88] 
-  mov    r15, [reg_p1+96]
-  mov    rcx, [reg_p1+104]
-  sbb    r13, [reg_p2+80]
-  sbb    r14, [reg_p2+88]
-  sbb    r15, [reg_p2+96] 
-  sbb    rcx, [reg_p2+104] 
-  sbb    rax, 0
-  
-  // Add p434 anded with the mask in rax 
-  mov    r8, [rip+fmt(p434)]
-  mov    r9, [rip+fmt(p434)+24]
-  mov    r10, [rip+fmt(p434)+32]
-  mov    rdi, [rip+fmt(p434)+40]
-  mov    rsi, [rip+fmt(p434)+48]
-  and    r8, rax
-  and    r9, rax
-  and    r10, rax
-  and    rdi, rax
-  and    rsi, rax
-  mov    rax, [reg_p3+56]
-  add    rax, r8
-  adc    r11, r8
-  adc    r12, r8
-  adc    r13, r9
-  adc    r14, r10
-  adc    r15, rdi
-  adc    rcx, rsi
-  
-  mov    [reg_p3+56], rax
-  mov    [reg_p3+64], r11
-  mov    [reg_p3+72], r12
-  mov    [reg_p3+80], r13
-  mov    [reg_p3+88], r14
-  mov    [reg_p3+96], r15
-  mov    [reg_p3+104], rcx
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
-//***********************************************************************
-//  Double 2x434-bit multiprecision subtraction
-//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
-//*********************************************************************** 
-.global fmt(mp_dblsub434x2_asm)
-fmt(mp_dblsub434x2_asm):
-  push   r12
-  push   r13
-  push   r14
-  
-  mov    r8, [reg_p3]
-  mov    r9, [reg_p3+8]
-  mov    r10, [reg_p3+16]
-  mov    r11, [reg_p3+24]
-  mov    r12, [reg_p3+32]
-  mov    r13, [reg_p3+40]
-  mov    r14, [reg_p3+48]
-  sub    r8, [reg_p1]
-  sbb    r9, [reg_p1+8] 
-  sbb    r10, [reg_p1+16] 
-  sbb    r11, [reg_p1+24] 
-  sbb    r12, [reg_p1+32] 
-  sbb    r13, [reg_p1+40] 
-  sbb    r14, [reg_p1+48]
-  setc   al  
-  sub    r8, [reg_p2]
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    r14, [reg_p2+48]
-  setc   cl  
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], r14
-    
-  mov    r8, [reg_p3+56]
-  mov    r9, [reg_p3+64]
-  mov    r10, [reg_p3+72]
-  mov    r11, [reg_p3+80]
-  mov    r12, [reg_p3+88]
-  mov    r13, [reg_p3+96]
-  mov    r14, [reg_p3+104]
-  bt     rax, 0  
-  sbb    r8, [reg_p1+56] 
-  sbb    r9, [reg_p1+64] 
-  sbb    r10, [reg_p1+72] 
-  sbb    r11, [reg_p1+80] 
-  sbb    r12, [reg_p1+88] 
-  sbb    r13, [reg_p1+96] 
-  sbb    r14, [reg_p1+104]
-  bt     rcx, 0  
-  sbb    r8, [reg_p2+56] 
-  sbb    r9, [reg_p2+64] 
-  sbb    r10, [reg_p2+72] 
-  sbb    r11, [reg_p2+80] 
-  sbb    r12, [reg_p2+88] 
-  sbb    r13, [reg_p2+96] 
-  sbb    r14, [reg_p2+104] 
-  mov    [reg_p3+56], r8
-  mov    [reg_p3+64], r9
-  mov    [reg_p3+72], r10
-  mov    [reg_p3+80], r11
-  mov    [reg_p3+88], r12
-  mov    [reg_p3+96], r13
-  mov    [reg_p3+104], r14
-  
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
\ No newline at end of file
+//*******************************************************************************************
+
+.intel_syntax noprefix 
+
+// Format function and variable names for Mac OS X
+#if defined(__APPLE__)
+    #define fmt(f)    _##f
+#else
+    #define fmt(f)    f
+#endif
+
+// Registers that are used for parameter passing:
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+
+
+.text
+//***********************************************************************
+//  434-bit multiprecision addition (7 x 64-bit limbs, no modular reduction)
+//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]; the carry out of limb 6 is not returned
+//*********************************************************************** 
+.global fmt(mp_add434_asm)
+fmt(mp_add434_asm): 
+  mov    r8, [reg_p1]            // load a[0..3]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  add    r8, [reg_p2]            // a[0..3] + b[0..3], carry chained through CF
+  adc    r9, [reg_p2+8] 
+  adc    r10, [reg_p2+16] 
+  adc    r11, [reg_p2+24] 
+  mov    [reg_p3], r8            // store c[0..3]; mov does not modify CF
+  mov    [reg_p3+8], r9
+  mov    [reg_p3+16], r10
+  mov    [reg_p3+24], r11
+  
+  mov    r8, [reg_p1+32]         // load a[4..6]
+  mov    r9, [reg_p1+40]
+  mov    r10, [reg_p1+48]
+  adc    r8, [reg_p2+32]         // continues the carry chain from limb 3
+  adc    r9, [reg_p2+40] 
+  adc    r10, [reg_p2+48] 
+  mov    [reg_p3+32], r8         // store c[4..6]
+  mov    [reg_p3+40], r9
+  mov    [reg_p3+48], r10
+  ret
+
+
+//***********************************************************************
+//  Field addition (7 x 64-bit limbs)
+//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] - p434x2, with p434x2 added back if that subtraction borrows
+//*********************************************************************** 
+.global fmt(fpadd434_asm)
+fmt(fpadd434_asm):
+  push   r12                     // save the registers used below as limb/constant holders
+  push   r13
+  push   r14
+  push   r15
+  push   rbx
+  push   rbp
+  
+  xor    rax, rax                // rax = 0; becomes the borrow mask further down
+  mov    r8, [reg_p1]            // r8:r14 <- a[0..6]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  add    r8, [reg_p2]            // r8:r14 <- a + b
+  adc    r9, [reg_p2+8] 
+  adc    r10, [reg_p2+16] 
+  adc    r11, [reg_p2+24] 
+  adc    r12, [reg_p2+32] 
+  adc    r13, [reg_p2+40] 
+  adc    r14, [reg_p2+48]
+
+  mov    rbx, [rip+fmt(p434x2)]  // r8:r14 <- (a + b) - p434x2, constant words kept in registers
+  sub    r8, rbx
+  mov    rcx, [rip+fmt(p434x2)+8]   // this word is used for limb 1 and limb 2 (offset +16 is never loaded)
+  sbb    r9, rcx
+  sbb    r10, rcx
+  mov    rdi, [rip+fmt(p434x2)+24]  // rdi (reg_p1) reused as scratch; a was fully read above
+  sbb    r11, rdi
+  mov    rsi, [rip+fmt(p434x2)+32]  // rsi (reg_p2) reused as scratch; b was fully read above
+  sbb    r12, rsi
+  mov    rbp, [rip+fmt(p434x2)+40]
+  sbb    r13, rbp
+  mov    r15, [rip+fmt(p434x2)+48]
+  sbb    r14, r15
+  sbb    rax, 0                  // rax = 0 - borrow: all ones if the subtraction borrowed, else 0
+  
+  and    rbx, rax                // keep the p434x2 words only when the mask is set
+  and    rcx, rax
+  and    rdi, rax
+  and    rsi, rax
+  and    rbp, rax
+  and    r15, rax
+  
+  add    r8, rbx                 // add back (p434x2 & mask); no branch on the borrow
+  adc    r9, rcx  
+  adc    r10, rcx  
+  adc    r11, rdi 
+  adc    r12, rsi 
+  adc    r13, rbp   
+  adc    r14, r15
+  mov    [reg_p3], r8            // store c[0..6]
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13 
+  mov    [reg_p3+48], r14
+  
+  pop    rbp                     // restore in reverse push order
+  pop    rbx
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+
+//***********************************************************************
+//  Field subtraction (7 x 64-bit limbs)
+//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2], with p434x2 added if the subtraction borrows
+//*********************************************************************** 
+.global fmt(fpsub434_asm)
+fmt(fpsub434_asm):
+  push   r12
+  push   r13
+  push   r14
+  
+  xor    rax, rax                // rax = 0; becomes the borrow mask
+  mov    r8, [reg_p1]            // r8:r14 <- a[0..6]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  sub    r8, [reg_p2]            // r8:r14 <- a - b
+  sbb    r9, [reg_p2+8] 
+  sbb    r10, [reg_p2+16] 
+  sbb    r11, [reg_p2+24] 
+  sbb    r12, [reg_p2+32] 
+  sbb    r13, [reg_p2+40] 
+  sbb    r14, [reg_p2+48]
+  sbb    rax, 0                  // rax = 0 - borrow: all ones if a - b borrowed, else 0
+  
+  mov    rcx, [rip+fmt(p434x2)]     // low correction words; rdi/rsi (reg_p1/reg_p2) reused as scratch
+  mov    rdi, [rip+fmt(p434x2)+8]   // used for limb 1 and limb 2 (offset +16 is never loaded)
+  mov    rsi, [rip+fmt(p434x2)+24]
+  and    rcx, rax
+  and    rdi, rax
+  and    rsi, rax  
+  add    r8, rcx                 // limbs 0..3 += (p434x2 & mask)
+  adc    r9, rdi  
+  adc    r10, rdi  
+  adc    r11, rsi 
+  mov    [reg_p3], r8
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11 
+  setc   cl                      // save the carry out of limb 3: the and instructions below clear CF
+
+  mov    r8, [rip+fmt(p434x2)+32]   // high correction words
+  mov    rdi, [rip+fmt(p434x2)+40]
+  mov    rsi, [rip+fmt(p434x2)+48]
+  and    r8, rax
+  and    rdi, rax
+  and    rsi, rax  
+  bt     rcx, 0                  // reload CF from bit 0 of rcx (the value written by setc cl)
+  adc    r12, r8                 // limbs 4..6 += (p434x2 & mask) + saved carry
+  adc    r13, rdi   
+  adc    r14, rsi
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13
+  mov    [reg_p3+48], r14
+  
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+
+// MACRO SUB434_PX: c [reg_p3] = a [reg_p1] - b [reg_p2] + P0 over 7 limbs; P0 is always added (no borrow test). Clobbers rax, rcx, rdi, rsi, r8-r11.
+.macro SUB434_PX  P0
+  push   r12
+  push   r13
+  
+  mov    r8, [reg_p1]            // r8:r13, rcx <- a[0..6]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    rcx, [reg_p1+48]
+  sub    r8, [reg_p2]            // a - b; the borrow out of limb 6 is not recorded
+  sbb    r9, [reg_p2+8] 
+  sbb    r10, [reg_p2+16] 
+  sbb    r11, [reg_p2+24] 
+  sbb    r12, [reg_p2+32] 
+  sbb    r13, [reg_p2+40] 
+  sbb    rcx, [reg_p2+48]
+
+  mov    rax, [rip+\P0]          // rdi/rsi (reg_p1/reg_p2) are reused as scratch from here on
+  mov    rdi, [rip+\P0+8]        // used for limb 1 and limb 2 (offset +16 is never loaded)
+  mov    rsi, [rip+\P0+24]
+  add    r8, rax                 // (a - b) + P0, carry chained through CF
+  mov    rax, [rip+\P0+32]       // mov does not modify CF, so the loads can sit inside the chain
+  adc    r9, rdi  
+  adc    r10, rdi 
+  adc    r11, rsi 
+  mov    rdi, [rip+\P0+40]
+  mov    rsi, [rip+\P0+48]
+  adc    r12, rax   
+  adc    r13, rdi  
+  adc    rcx, rsi
+  mov    [reg_p3], r8            // store c[0..6]
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13
+  mov    [reg_p3+48], rcx
+  
+  pop    r13
+  pop    r12
+.endm
+
+
+//***********************************************************************
+//  Multiprecision subtraction with unconditional correction by 2*p434 (expands SUB434_PX)
+//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p434 (constant read from p434x2)
+//*********************************************************************** 
+.global fmt(mp_sub434_p2_asm)
+fmt(mp_sub434_p2_asm):
+
+  SUB434_PX  fmt(p434x2)
+  ret
+
+
+#ifdef _MULX_
+#ifdef _ADX_
+
+///////////////////////////////////////////////////////////////// MACROS
+// z = a x bi + z
+// Inputs: base memory pointer M1 (a),
+//         bi pre-stored in rdx,
+//         accumulator z in [Z0:Z4] or [Z0:Z7]
+// Output: [Z0:Z4] or [Z0:Z7]
+// Temps:  regs T0:T1; MULADD64x448 zeroes its C argument (callers pass rax or the same register as Z7), MULADD64x256 zeroes rax
+/////////////////////////////////////////////////////////////////
+.macro MULADD64x448 M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1, C
+    mulx   \T0, \T1, \M1     // T0:T1 = rdx * a[0]  (T0 high, T1 low)
+    xor    \C, \C            // C = 0; also clears CF and OF to start both carry chains
+    adox   \Z0, \T1          // OF chain: low word of the first product, then every high word
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // T0:T1 = rdx * a[1]
+    adcx   \Z1, \T1          // CF chain: low words of products 1..6
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // T0:T1 = rdx * a[2]
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // T0:T1 = rdx * a[3]
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    mulx   \T0, \T1, 32\M1   // T0:T1 = rdx * a[4]
+    adcx   \Z4, \T1
+    adox   \Z5, \T0
+    mulx   \T0, \T1, 40\M1   // T0:T1 = rdx * a[5]
+    adcx   \Z5, \T1
+    adox   \Z6, \T0
+    mulx   \T0, \T1, 48\M1   // T0:T1 = rdx * a[6]
+    adcx   \Z6, \T1
+    adox   \Z7, \T0
+    adc    \Z7, 0            // fold the remaining CF into the top limb; a carry out of Z7 is not kept
+.endm
+
+
+.macro MULADD64x256 M1, Z0, Z1, Z2, Z3, Z4, T0, T1
+    mulx   \T0, \T1, \M1     // T0:T1 = rdx * M1[0]  (T0 high, T1 low); [Z0:Z4] += rdx * M1[0..3]
+    xor    rax, rax          // rax = 0; also clears CF and OF to start both carry chains
+    adox   \Z0, \T1          // OF chain: low word of the first product, then every high word
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // T0:T1 = rdx * M1[1]
+    adcx   \Z1, \T1          // CF chain: low words of products 1..3
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // T0:T1 = rdx * M1[2]
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // T0:T1 = rdx * M1[3]
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    adcx   \Z4, rax          // fold the remaining CF into the top limb (rax is 0)
+.endm
+
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x b + c x d (mod p), reduction by p434p1 interleaved per 64-bit word
+// Inputs: memory operands M00 (a), M10 (b), M01 (c), M11 (d); rcx = output pointer,
+//         accumulator z in [Z0:Z7] pre-loaded by the caller with a x b[0],
+//         stack top must hold, in pop order, the saved T1, T0, Z7, Z6
+// Output: limbs 0-4 stored to [rcx]..[rcx+32], limbs 5-6 left in Z4, Z5
+// Temps:  regs T0:T1, rax, rdx (T1, T0, Z7, Z6 are restored by the pops at the end)
+/////////////////////////////////////////////////////////////////
+.macro FPDBLMUL448x448 M00, M01, M10, M11, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1           
+    mov    rdx, \M11                // rdx <- d[0]
+    MULADD64x448 \M01, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, rax                
+    // [Z1:Z7] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z0                 // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1
+    
+    // [Z1:Z7, Z0] <- z = a x b[1] + c x d[1] + z; Z0 is recycled as the new top limb
+    mov    rdx, 8\M10
+    MULADD64x448 \M00, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \Z0           
+    mov    rdx, 8\M11    
+    MULADD64x448 \M01, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, rax        
+    // [Z2:Z7, Z0] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z1                 // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1
+    
+    // [Z2:Z7, Z0:Z1] <- z = a x b[2] + c x d[2] + z
+    mov    rdx, 16\M10
+    MULADD64x448 \M00, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \Z1          
+    mov    rdx, 16\M11    
+    MULADD64x448 \M01, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, rax     
+    // [Z3:Z7, Z0:Z1] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z2                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1
+    
+    // [Z3:Z7, Z0:Z2] <- z = a x b[3] + c x d[3] + z
+    mov    rdx, 24\M10
+    MULADD64x448 \M00, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \Z2          
+    mov    rdx, 24\M11    
+    MULADD64x448 \M01, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, rax     
+    // [Z4:Z7, Z0:Z2] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z3                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1
+    
+    // [Z4:Z7, Z0:Z3] <- z = a x b[4] + c x d[4] + z
+    mov    rdx, 32\M10
+    MULADD64x448 \M00, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3          
+    mov    rdx, 32\M11    
+    MULADD64x448 \M01, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, rax     
+    // [Z5:Z7, Z0:Z3] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z4                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+    
+    // [Z5:Z7, Z0:Z4] <- z = a x b[5] + c x d[5] + z
+    mov    rdx, 40\M10
+    MULADD64x448 \M00, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4          
+    mov    rdx, 40\M11    
+    MULADD64x448 \M01, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, rax     
+    // [Z6:Z7, Z0:Z4] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z5                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1
+    
+    // [Z6:Z7, Z0:Z5] <- z = a x b[6] + c x d[6] + z
+    mov    rdx, 48\M10
+    MULADD64x448 \M00, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5          
+    mov    rdx, 48\M11    
+    MULADD64x448 \M01, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, rax 
+    pop    \T1                     // T1 no longer needed: restore the caller's value
+    mov    [rcx], \Z7              // result limb 0 is final; frees Z7 for use as a temp
+    // [Z7, Z0:Z5] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z6                // rdx <- z0
+    // Last reduction step, written inline (instead of MULADD64x256) with Z6/Z7 as temps so stores and pops can be interleaved
+    mulx   \Z6, \Z7, [rip+fmt(p434p1)+24]   // fmt() keeps the symbol name consistent with the rest of the file on Mac OS X
+    pop    \T0
+    adox   \Z1, \Z7                // no xor here: CF/OF are whatever the last adc left - NOTE(review): confirm both are 0
+    adox   \Z2, \Z6  
+    mov    [rcx+8], \Z0            // result limb 1
+    mulx   \Z6, \Z7, [rip+fmt(p434p1)+32]
+    mov    [rcx+16], \Z1           // result limb 2
+    adcx   \Z2, \Z7
+    adox   \Z3, \Z6   
+    mov    [rcx+24], \Z2           // result limb 3; Z1 and Z2 become temps below
+    mulx   \Z2, \Z1, [rip+fmt(p434p1)+40]
+    pop    \Z7
+    adcx   \Z3, \Z1
+    adox   \Z4, \Z2   
+    mov    [rcx+32], \Z3           // result limb 4
+    mulx   \Z2, \Z1, [rip+fmt(p434p1)+48] 
+    pop    \Z6        
+    adcx   \Z4, \Z1                // result limb 5 stays in Z4
+    adox   \Z5, \Z2                // result limb 6 stays in Z5
+    adc    \Z5, 0
+.endm
+
+
+//***********************************************************************
+//  Multiplication in GF(p^2), non-complex part
+//  Operation: c [reg_p3] = a0 x b0 - a1 x b1, computed as a0 x b0 + a1 x (8*p - b1)
+//  Inputs: a = [a1, a0] stored in [reg_p1] (a0 at +0, a1 at +56)
+//          b = [b1, b0] stored in [reg_p2] (b0 at +0, b1 at +56)
+//  Output: c stored in [reg_p3]; c is first used as scratch for 8*p - b1 - NOTE(review): so c presumably must not alias a or b
+//***********************************************************************
+.global fmt(fp2mul434_c0_asm)
+fmt(fp2mul434_c0_asm):   
+    push   r12 
+    mov    rcx, reg_p3             // rcx = c; rdx (reg_p3) is overwritten below with mulx multipliers
+	
+	// [rcx+0 : rcx+48] <- 8*p - b1   (pushes of r13-r15 are interleaved; push does not modify flags)
+	mov    r8, [rip+fmt(p434x8)]  
+	mov    r9, [rip+fmt(p434x8)+8]   
+	mov    r11, [rip+fmt(p434x8)+24]
+	mov    r12, [rip+fmt(p434x8)+32] 
+	mov    rax, [reg_p2+56]
+	mov    rdx, [reg_p2+64]
+	mov    r10, r9                 // limb 2 reuses the word loaded from p434x8+8 (offset +16 is never loaded)
+	sub    r8, rax
+    push   r13 
+	sbb    r9, rdx
+	mov    rax, [reg_p2+72]
+	mov    rdx, [reg_p2+80]
+	sbb    r10, rax
+    push   r14 
+	sbb    r11, rdx
+	mov    rax, [reg_p2+88]
+	mov    rdx, [reg_p2+96]
+	mov    r13, [rip+fmt(p434x8)+40]
+	mov    r14, [rip+fmt(p434x8)+48]
+	mov    [rcx], r8
+	sbb    r12, rax
+    push   r15 
+	sbb    r13, rdx
+	mov    rax, [reg_p2+104]
+	mov    [rcx+8], r9
+	sbb    r14, rax 
+	mov    [rcx+16], r10
+    
+    // [r8:r15] <- a0 x b00 (first partial product); the remaining scratch stores are interleaved
+    mov    rdx, [reg_p2]
+    mulx   r9, r8, [reg_p1] 
+	mov    [rcx+24], r11      
+    xor    rax, rax                // rax = 0; clears OF for the adox chain
+    mulx   r10, r11, [reg_p1+8] 
+	mov    [rcx+32], r12     
+    adox   r9, r11        
+    mulx   r11, r12, [reg_p1+16]
+	mov    [rcx+40], r13     
+    adox   r10, r12        
+    mulx   r12, r13, [reg_p1+24] 
+	mov    [rcx+48], r14     
+    adox   r11, r13       
+    mulx   r13, r14, [reg_p1+32]
+    push   rbx 
+    adox   r12, r14      
+    mulx   r14, r15, [reg_p1+40] 
+    push   rbp  
+    adox   r13, r15      
+    mulx   r15, rbx, [reg_p1+48]   
+    adox   r14, rbx 
+    adox   r15, rax                // fold the last OF carry into the top limb (rax is 0)
+
+	FPDBLMUL448x448 [reg_p1], [reg_p1+56], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp
+	           
+    mov    [rcx+40], r12           // the macro stored c[0..4]; limbs 5 and 6 come back in r12, r13
+    mov    [rcx+48], r13
+    pop    r13                     // rbp, rbx, r15, r14 were already popped inside FPDBLMUL448x448
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Multiplication in GF(p^2), complex part
+//  Operation: c [reg_p3] = a0 x b1 + a1 x b0
+//  Inputs: a = [a1, a0] stored in [reg_p1] (a0 at +0, a1 at +56)
+//          b = [b1, b0] stored in [reg_p2] (b0 at +0, b1 at +56)
+//  Output: c stored in [reg_p3]
+//***********************************************************************
+.global fmt(fp2mul434_c1_asm)
+fmt(fp2mul434_c1_asm): 
+    mov    rcx, reg_p3             // rcx = c; rdx (reg_p3) is overwritten below with mulx multipliers
+    
+    // [r8:r15] <- a1 x b00 (first partial product); register saves are interleaved (push does not modify flags)
+    mov    rdx, [reg_p2]
+    mulx   r9, r8, [reg_p1+56]     // a1[0] x b00
+    xor    rax, rax                // rax = 0; clears OF for the adox chain
+    push   r12 
+    mulx   r10, r11, [reg_p1+64]  
+    push   r13  
+    adox   r9, r11        
+    mulx   r11, r12, [reg_p1+72]  
+    push   r14  
+    adox   r10, r12        
+    mulx   r12, r13, [reg_p1+80]  
+    push   r15   
+    adox   r11, r13       
+    mulx   r13, r14, [reg_p1+88] 
+    push   rbx    
+    adox   r12, r14      
+    mulx   r14, r15, [reg_p1+96] 
+    push   rbp 
+    adox   r13, r15      
+    mulx   r15, rbx, [reg_p1+104] 
+    adox   r14, rbx 
+    adox   r15, rax                // fold the last OF carry into the top limb (rax is 0)
+
+	FPDBLMUL448x448 [reg_p1+56], [reg_p1], [reg_p2], [reg_p2+56], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp
+                             
+    mov    [rcx+40], r12           // the macro stored c[0..4]; limbs 5 and 6 come back in r12, r13
+    mov    [rcx+48], r13
+    pop    r13                     // rbp, rbx, r15, r14 were already popped inside FPDBLMUL448x448
+    pop    r12
+    ret
+
+#else
+
+# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE"
+
+#endif
+
+#else
+
+# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE"
+
+#endif
+
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x b (mod p): remaining six multiply rounds, each preceded by a reduction step, plus one final reduction step
+// Inputs: base memory pointers M0 (a), M1 (b); only words 1..6 of M0 (offsets 8..48) are read here
+//         bi pre-stored in rdx, NOTE(review): rdx is overwritten by the first instruction, so no incoming rdx value is consumed
+//         accumulator z in [Z0:Z7], pre-stores a0 x b
+// Output: [Z0:Z7] registers are rotated each round; per the last step the 7-word result is left in [Z7, Z0:Z5]
+// Temps:  regs T0:T1 (rdx is clobbered as well)
+/////////////////////////////////////////////////////////////////
+.macro FPMUL448x448 M0, M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1           
+    // [Z1:Z7] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z0                 // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1
+    
+    // [Z1:Z7, \Z0] <- z = a01 x a1 + z 
+    mov    rdx, 8\M0                // rdx <- word 1 of M0 (displacement 8 prefixed to the bracketed M0 operand)
+    MULADD64x448 \M1, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \Z0
+    // [Z2:Z7, Z0] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z1                 // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1
+    
+    // [Z2:Z7, Z0:Z1] <- z = a02 x a1 + z  
+    mov    rdx, 16\M0               // rdx <- word 2 of M0
+    MULADD64x448 \M1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \Z1
+    // [Z3:Z7, Z0:Z1] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z2                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1
+    
+    // [Z3:Z7, Z0:Z2] <- z = a03 x a1 + z
+    mov    rdx, 24\M0               // rdx <- word 3 of M0
+    MULADD64x448 \M1, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \Z2
+    // [Z4:Z7, Z0:Z2] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z3                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1
+    
+    // [Z4:Z7, Z0:Z3] <- z = a04 x a1 + z 
+    mov    rdx, 32\M0               // rdx <- word 4 of M0
+    MULADD64x448 \M1, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3
+    // [Z5:Z7, Z0:Z3] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z4                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+    
+    // [Z5:Z7, Z0:Z4] <- z = a05 x a1 + z    
+    mov    rdx, 40\M0               // rdx <- word 5 of M0
+    MULADD64x448 \M1, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4
+    // [Z6:Z7, Z0:Z4] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z5                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1
+    
+    // [Z6:Z7, Z0:Z5] <- z = a06 x a1 + z  
+    mov    rdx, 48\M0               // rdx <- word 6 (last word) of M0
+    MULADD64x448 \M1, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5
+    // [Z7, Z0:Z5] <- z = (z0 x p434p1 + z)/2^64
+    mov    rdx, \Z6                // rdx <- z0
+    MULADD64x256 [rip+fmt(p434p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1
+.endm
+
+
+//***********************************************************************
+//  Squaring in GF(p^2), non-complex part
+//  Operation: c [reg_p2] = (a0+a1) x (a0-a1)
+//  Inputs: a = [a1, a0] stored in [reg_p1] (a0 at byte offset 0, a1 at byte offset 56)
+//  Output: c stored in [reg_p2]; [reg_p2+56..reg_p2+111] is overwritten as scratch, and [reg_p1] is re-read after [reg_p2] is written, so c must not overlap a
+//***********************************************************************
+.global fmt(fp2sqr434_c0_asm)
+fmt(fp2sqr434_c0_asm):   
+    push   r12 
+
+	// a0 + a1 (plain 7-word addition, kept in r8..r14 and spilled to [reg_p2..reg_p2+48])
+	mov    r8, [reg_p1]
+	mov    r9, [reg_p1+8]
+	mov    r10, [reg_p1+16]
+	mov    r11, [reg_p1+24]
+	mov    r12, [reg_p1+32]
+	add    r8, [reg_p1+56]
+    push   r13
+	adc    r9, [reg_p1+64]
+	adc    r10, [reg_p1+72]
+    push   r14 
+	adc    r11, [reg_p1+80]
+	adc    r12, [reg_p1+88]
+	mov    r13, [reg_p1+40]
+	mov    r14, [reg_p1+48]
+	adc    r13, [reg_p1+96]
+	adc    r14, [reg_p1+104]
+	mov    [reg_p2], r8
+	mov    [reg_p2+8], r9
+	mov    [reg_p2+16], r10
+	mov    [reg_p2+24], r11
+	mov    [reg_p2+32], r12
+	mov    [reg_p2+40], r13
+	mov    [reg_p2+48], r14
+	
+	// a0 - a1 + 4xp434 (held in rax, r10, r12, r13, r14, r15, rcx)
+	mov    rax, [reg_p1]
+	mov    r10, [reg_p1+8]
+	mov    r12, [reg_p1+16]
+	mov    r13, [reg_p1+24]
+	mov    r14, [reg_p1+32]
+	sub    rax, [reg_p1+56]
+	sbb    r10, [reg_p1+64]
+	sbb    r12, [reg_p1+72] 
+    push   r15 
+	sbb    r13, [reg_p1+80]
+	sbb    r14, [reg_p1+88]
+	mov    r15, [reg_p1+40]
+	mov    rcx, [reg_p1+48]
+	sbb    r15, [reg_p1+96]
+	sbb    rcx, [reg_p1+104]
+	add    rax, [rip+fmt(p434x4)]                    
+	mov    rdx, [rip+fmt(p434x4)+8]     // the same loaded word is reused for the next two limbs
+	adc    r10, rdx
+	adc    r12, rdx
+	adc    r13, [rip+fmt(p434x4)+24]
+	adc    r14, [rip+fmt(p434x4)+32]
+	adc    r15, [rip+fmt(p434x4)+40]
+	adc    rcx, [rip+fmt(p434x4)+48]
+	mov    [reg_p2+56], rax 
+    
+    // [r8:r15] <- z = s0 x d, where s = a0+a1 (word 0 still in r8) and d = a0-a1+4p
+    mov    rdx, r8
+    mulx   r9, r8, rax                 
+	mov    [reg_p2+64], r10              // each word of d is spilled to [reg_p2+56..] before mulx reuses its register
+    xor    rax, rax                     // rax <- 0; also clears CF/OF before the adox carry chain
+    push   rbx  
+    mulx   r10, r11, r10  
+	mov    [reg_p2+72], r12 
+    adox   r9, r11        
+    mulx   r11, r12, r12 
+	mov    [reg_p2+80], r13  
+    adox   r10, r12        
+    mulx   r12, r13, r13 
+	mov    [reg_p2+88], r14  
+    adox   r11, r13       
+    mulx   r13, r14, r14 
+	mov    [reg_p2+96], r15  
+    adox   r12, r14      
+    mulx   r14, r15, r15  
+	mov    [reg_p2+104], rcx 
+    adox   r13, r15      
+    mulx   r15, rbx, rcx  
+    adox   r14, rbx 
+    adox   r15, rax                     // propagate the last carry into the top word (rax = 0)
+    // remaining rounds: M0 = s at [reg_p2], M1 = d at [reg_p2+56]; rcx is free to serve as a temp from here on
+	FPMUL448x448 [reg_p2], [reg_p2+56], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rcx
+    // store the 7-word result, which the macro leaves in r15, r8..r13
+    mov    [reg_p2], r15   
+    mov    [reg_p2+8], r8         
+    mov    [reg_p2+16], r9         
+    mov    [reg_p2+24], r10      
+    mov    [reg_p2+32], r11      
+    mov    [reg_p2+40], r12      
+    mov    [reg_p2+48], r13
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Squaring in GF(p^2), complex part
+//  Operation: c [reg_p2] = 2a0 x a1
+//  Inputs: a = [a1, a0] stored in [reg_p1] (a0 at byte offset 0, a1 at byte offset 56)
+//  Output: c stored in [reg_p2] (7 words, written only after all reads of a are done)
+//***********************************************************************
+.global fmt(fp2sqr434_c1_asm)
+fmt(fp2sqr434_c1_asm):  
+    push   r12
+	// 2a0 = a0 + a0, computed in r8..r14
+	mov    r8, [reg_p1]
+	mov    r9, [reg_p1+8]
+	mov    r10, [reg_p1+16]
+	mov    r11, [reg_p1+24]
+	mov    r12, [reg_p1+32]
+	add    r8, r8
+    push   r13 
+	adc    r9, r9
+	adc    r10, r10
+    push   r14 
+	adc    r11, r11 
+	mov    r13, [reg_p1+40] 
+	mov    r14, [reg_p1+48] 
+	adc    r12, r12    
+    push   r15 
+	adc    r13, r13  
+    push   rbx 
+	adc    r14, r14
+	sub    rsp, 56                  // 7-word stack buffer for 2a0
+	mov    [rsp+8], r9              // word 0 is never stored: it is consumed straight from r8 below, and the macro reads offsets 8..48 only
+	mov    [rsp+16], r10
+    
+    // [r8:r15] <- z = t0 x a1, where t = 2a0 (word 0 in r8) and a1 is read from [reg_p1+56..]
+    mov    rdx, r8
+    mulx   r9, r8, [reg_p1+56] 
+	mov    [rsp+24], r11            // remaining words of 2a0 are spilled before mulx overwrites their registers
+    xor    rax, rax                 // rax <- 0; also clears CF/OF before the adox carry chain
+    mulx   r10, r11, [reg_p1+64]
+	mov    [rsp+32], r12
+    adox   r9, r11        
+    mulx   r11, r12, [reg_p1+72]
+	mov    [rsp+40], r13
+    adox   r10, r12        
+    mulx   r12, r13, [reg_p1+80] 
+	mov    [rsp+48], r14
+    adox   r11, r13       
+    mulx   r13, r14, [reg_p1+88]
+    adox   r12, r14      
+    mulx   r14, r15, [reg_p1+96]   
+    adox   r13, r15      
+    mulx   r15, rbx, [reg_p1+104]  
+    adox   r14, rbx 
+    adox   r15, rax                 // propagate the last carry into the top word (rax = 0)
+    // remaining rounds: M0 = 2a0 on the stack, M1 = a1; rbx and rcx serve as temps
+	FPMUL448x448 [rsp], [reg_p1+56], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rcx
+
+	add    rsp, 56                  // release the stack buffer before restoring registers
+    mov    [reg_p2], r15            // result is left by the macro in r15, r8..r13
+    mov    [reg_p2+8], r8         
+    mov    [reg_p2+16], r9         
+    mov    [reg_p2+24], r10      
+    mov    [reg_p2+32], r11      
+    mov    [reg_p2+40], r12      
+    mov    [reg_p2+48], r13
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Field multiplication in GF(p)
+//  Operation: c = a x b mod p
+//  Inputs: a stored in [reg_p1], b stored in [reg_p2] (7 words each)
+//  Output: c stored in [reg_p3] (7 words, written only after all reads of a and b are done)
+//***********************************************************************
+.global fmt(fpmul434_asm)
+fmt(fpmul434_asm): 
+    mov    rcx, reg_p3              // keep the output pointer in rcx; rdx is reloaded as the mulx multiplier below
+     
+    // [r8:r15] <- z = a x b0
+    mov    rdx, [reg_p2]            // rdx <- b0 (word 0 of b)
+    mulx   r9, r8, [reg_p1]  
+    push   r12                      // register saves are interleaved with the multiplications
+    xor    rax, rax                 // rax <- 0; also clears CF/OF before the adox carry chain
+    mulx   r10, r11, [reg_p1+8]
+    push   r13 
+    adox   r9, r11        
+    mulx   r11, r12, [reg_p1+16]
+    push   r14 
+    adox   r10, r12        
+    mulx   r12, r13, [reg_p1+24]   
+    push   r15 
+    adox   r11, r13       
+    mulx   r13, r14, [reg_p1+32]    
+    push   rbx   
+    adox   r12, r14      
+    mulx   r14, r15, [reg_p1+40]  
+    push   rbp
+    adox   r13, r15      
+    mulx   r15, rbx, [reg_p1+48]  
+    adox   r14, rbx 
+    adox   r15, rax                 // propagate the last carry into the top word (rax = 0)
+    // remaining rounds: M0 = b (words 1..6 become the multipliers), M1 = a; rbx and rbp serve as temps
+	FPMUL448x448 [reg_p2], [reg_p1], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp
+    // store the 7-word result, which the macro leaves in r15, r8..r13
+    mov    [rcx], r15 
+    mov    [rcx+8], r8         
+    mov    [rcx+16], r9         
+    mov    [rcx+24], r10      
+    mov    [rcx+32], r11      
+    mov    [rcx+40], r12      
+    mov    [rcx+48], r13
+    pop    rbp
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
\ No newline at end of file
diff --git a/src/P434/ARM64/fp_arm64.c b/src/P434/ARM64/fp_arm64.c
index 48cf3de..de65295 100644
--- a/src/P434/ARM64/fp_arm64.c
+++ b/src/P434/ARM64/fp_arm64.c
@@ -1,10 +1,15 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P434
 *********************************************************************************************/
 
 #include "../P434_internal.h"
+#include "../../internal.h"
 
 // Global constants
 extern const uint64_t p434[NWORDS_FIELD];
@@ -13,21 +18,21 @@ extern const uint64_t p434x2[NWORDS_FIELD];
 extern const uint64_t p434x4[NWORDS_FIELD];
 
 
-__inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub434_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. 
     
     mp_sub434_p2_asm(a, b, c); 
 } 
 
 
-__inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub434_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
     
     mp_sub434_p4_asm(a, b, c);
 }
 
 
-__inline void fpadd434(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpadd434(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular addition, c = a+b mod p434.
   // Inputs: a, b in [0, 2*p434-1] 
   // Output: c in [0, 2*p434-1]
@@ -36,7 +41,7 @@ __inline void fpadd434(const digit_t* a, const digit_t* b, digit_t* c)
 } 
 
 
-__inline void fpsub434(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpsub434(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular subtraction, c = a-b mod p434.
   // Inputs: a, b in [0, 2*p434-1] 
   // Output: c in [0, 2*p434-1] 
@@ -45,7 +50,7 @@ __inline void fpsub434(const digit_t* a, const digit_t* b, digit_t* c)
 }
 
 
-__inline void fpneg434(digit_t* a)
+inline void fpneg434(digit_t* a)
 { // Modular negation, a = -a mod p434.
   // Input/output: a in [0, 2*p434-1] 
     unsigned int i, borrow = 0;
diff --git a/src/P434/ARM64/fp_arm64_asm.S b/src/P434/ARM64/fp_arm64_asm.S
index ad4ddf3..c85480e 100644
--- a/src/P434/ARM64/fp_arm64_asm.S
+++ b/src/P434/ARM64/fp_arm64_asm.S
@@ -1,5 +1,9 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license
 //
 // Abstract: field arithmetic in 64-bit ARMv8 assembly for P434 on Linux
 //*******************************************************************************************
diff --git a/src/P434/P434.c b/src/P434/P434.c
index 800364e..b14863f 100644
--- a/src/P434/P434.c
+++ b/src/P434/P434.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P434
 *********************************************************************************************/  
@@ -27,12 +31,10 @@ const uint64_t p434x2[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF
                                                      0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 }; 
 const uint64_t p434x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xF705D9EB8BFFFFFF, 
                                                      0xEF1971E0C562BA8F, 0xB3F17F5A07148159, 0x0008D07C9C5DCD11 }; 
+const uint64_t p434x8[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEE0BB3D717FFFFFF, 
+                                                     0xDE32E3C18AC5751F, 0x67E2FEB40E2902B3, 0x0011A0F938BB9A23 }; 
 const uint64_t p434p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000,
-                                                     0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344 };  
-const uint64_t p434x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x47D130A3A0000000, 
-                                                     0x873470F9D4EA2B80, 0x6074052FC75BF530, 0x54497C1B1D119772, 0xC55F373D2CDCA412, 
-                                                     0x732CA2221C664B96, 0x6445AB96AF6359A5, 0x221708AB42ABE1B4, 0xAE3D3D0063244F01, 
-                                                     0x18B920F2ECF68816, 0x0000004DB194809D }; 
+                                                     0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344 }; 
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000 }; 
 // Order of Bob's subgroup
@@ -90,6 +92,7 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fpneg                         fpneg434
 #define fpdiv2                        fpdiv2_434
 #define fpcorrection                  fpcorrection434
+#define fpmul                         fpmul434
 #define fpmul_mont                    fpmul434_mont
 #define fpsqr_mont                    fpsqr434_mont
 #define fpinv_mont                    fpinv434_mont
@@ -107,6 +110,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fp2correction                 fp2correction434
 #define fp2mul_mont                   fp2mul434_mont
 #define fp2sqr_mont                   fp2sqr434_mont
+#define fp2mul_c0_mont                fp2mul434_c0_mont
+#define fp2mul_c1_mont                fp2mul434_c1_mont
+#define fp2sqr_c0_mont                fp2sqr434_c0_mont
+#define fp2sqr_c1_mont                fp2sqr434_c1_mont
 #define fp2inv_mont                   fp2inv434_mont
 #define fp2inv_mont_bingcd            fp2inv434_mont_bingcd
 #define fpequal_non_constant_time     fpequal434_non_constant_time
diff --git a/src/P434/P434_api.h b/src/P434/P434_api.h
index e274237..679b2c5 100644
--- a/src/P434/P434_api.h
+++ b/src/P434/P434_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P434
 *********************************************************************************************/  
diff --git a/src/P434/P434_compressed.c b/src/P434/P434_compressed.c
index 3c84740..7f07d38 100644
--- a/src/P434/P434_compressed.c
+++ b/src/P434/P434_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * Supersingular Isogeny Key Encapsulation Library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P434_compressed
 *********************************************************************************************/
@@ -28,12 +32,10 @@ const uint64_t p434x2[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF
                                                      0xF78CB8F062B15D47, 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 }; 
 const uint64_t p434x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xF705D9EB8BFFFFFF, 
                                                      0xEF1971E0C562BA8F, 0xB3F17F5A07148159, 0x0008D07C9C5DCD11 }; 
+const uint64_t p434x8[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEE0BB3D717FFFFFF, 
+                                                     0xDE32E3C18AC5751F, 0x67E2FEB40E2902B3, 0x0011A0F938BB9A23 }; 
 const uint64_t p434p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFDC1767AE3000000,
                                                      0x7BC65C783158AEA3, 0x6CFC5FD681C52056, 0x0002341F27177344 };  
-const uint64_t p434x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x47D130A3A0000000, 
-                                                     0x873470F9D4EA2B80, 0x6074052FC75BF530, 0x54497C1B1D119772, 0xC55F373D2CDCA412, 
-                                                     0x732CA2221C664B96, 0x6445AB96AF6359A5, 0x221708AB42ABE1B4, 0xAE3D3D0063244F01, 
-                                                     0x18B920F2ECF68816, 0x0000004DB194809D }; 
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000001000000 }; 
 // Order of Bob's subgroup
@@ -346,6 +348,7 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fpneg                         fpneg434
 #define fpdiv2                        fpdiv2_434
 #define fpcorrection                  fpcorrection434
+#define fpmul                         fpmul434
 #define fpmul_mont                    fpmul434_mont
 #define fpsqr_mont                    fpsqr434_mont
 #define fpinv_mont                    fpinv434_mont
@@ -363,6 +366,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fp2correction                 fp2correction434
 #define fp2mul_mont                   fp2mul434_mont
 #define fp2sqr_mont                   fp2sqr434_mont
+#define fp2mul_c0_mont                fp2mul434_c0_mont
+#define fp2mul_c1_mont                fp2mul434_c1_mont
+#define fp2sqr_c0_mont                fp2sqr434_c0_mont
+#define fp2sqr_c1_mont                fp2sqr434_c1_mont
 #define fp2inv_mont                   fp2inv434_mont
 #define fp2inv_mont_bingcd            fp2inv434_mont_bingcd
 #define fpequal_non_constant_time     fpequal434_non_constant_time
diff --git a/src/P434/P434_compressed_api.h b/src/P434/P434_compressed_api.h
index 06a2d6d..bb1022c 100644
--- a/src/P434/P434_compressed_api.h
+++ b/src/P434/P434_compressed_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P434 using compression 
 *********************************************************************************************/  
diff --git a/src/P434/P434_compressed_dlog_tables.c b/src/P434/P434_compressed_dlog_tables.c
index 2356ebf..4750acc 100644
--- a/src/P434/P434_compressed_dlog_tables.c
+++ b/src/P434/P434_compressed_dlog_tables.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for Pohlig-Hellman when using compression
 *********************************************************************************************/ 
diff --git a/src/P434/P434_compressed_pair_tables.c b/src/P434/P434_compressed_pair_tables.c
index ee8e334..64b6086 100644
--- a/src/P434/P434_compressed_pair_tables.c
+++ b/src/P434/P434_compressed_pair_tables.c
@@ -1,5 +1,9 @@
 /**************************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression
 ***************************************************************************************************/  
diff --git a/src/P434/P434_internal.h b/src/P434/P434_internal.h
index 52c5705..4c95cb2 100644
--- a/src/P434/P434_internal.h
+++ b/src/P434/P434_internal.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: internal header file for P434
 *********************************************************************************************/  
@@ -168,6 +172,8 @@ void rdc434_asm(digit_t* ma, digit_t* mc);
             
 // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768
 void fpmul434_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fpmul434(const digit_t* a, const digit_t* b, digit_t* c);
+void fpmul434_asm(const digit_t* a, const digit_t* b, digit_t* c);
 void mul434_asm(const digit_t* a, const digit_t* b, digit_t* c);
    
 // Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^768
@@ -207,9 +213,17 @@ void fp2correction434(f2elm_t a);
             
 // GF(p434^2) squaring using Montgomery arithmetic, c = a^2 in GF(p434^2)
 void fp2sqr434_mont(const f2elm_t a, f2elm_t c);
+void fp2sqr434_c0_mont(const digit_t* a, digit_t* c);
+void fp2sqr434_c0_asm(const digit_t* a, digit_t* c);
+void fp2sqr434_c1_mont(const digit_t* a, digit_t* c);
+void fp2sqr434_c1_asm(const digit_t* a, digit_t* c);
  
 // GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2)
 void fp2mul434_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+void fp2mul434_c0_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul434_c0_asm(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul434_c1_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul434_c1_asm(const digit_t* a, const digit_t* b, digit_t* c);
 
 // GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
 void fp2inv434_mont(f2elm_t a);
diff --git a/src/P434/generic/fp_generic.c b/src/P434/generic/fp_generic.c
index 83856b9..ae8663c 100755
--- a/src/P434/generic/fp_generic.c
+++ b/src/P434/generic/fp_generic.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: portable modular arithmetic for P434
 *********************************************************************************************/
diff --git a/src/P503/AMD64/fp_x64.c b/src/P503/AMD64/fp_x64.c
index ca3c6f2..6553325 100644
--- a/src/P503/AMD64/fp_x64.c
+++ b/src/P503/AMD64/fp_x64.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for x64 platforms for P503
 *********************************************************************************************/
@@ -17,7 +21,7 @@ extern const uint64_t p503x4[NWORDS_FIELD];
 
 inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -39,7 +43,7 @@ inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c)
 
 inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -50,11 +54,6 @@ inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c)
     for (i = 0; i < NWORDS_FIELD; i++) {
         ADDC(borrow, c[i], ((digit_t*)p503x4)[i], borrow, c[i]); 
     }
-    
-#elif (OS_TARGET == OS_NIX)                 
-    
-    mp_sub503_p4_asm(a, b, c);    
-
 #endif
 } 
 
@@ -161,13 +160,43 @@ void fpcorrection503(digit_t* a)
     }
 }
 
+#if (OS_TARGET == OS_NIX)
+
+void fp2mul503_c0_mont(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Forwards a, b, c unchanged to fp2mul503_c0_asm (presumably the non-complex part of a GF(p503^2) product -- confirm in fp_x64_asm.S).
+    fp2mul503_c0_asm(a, b, c);
+}
+
+
+void fp2mul503_c1_mont(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Forwards a, b, c unchanged to fp2mul503_c1_asm (presumably the complex part of a GF(p503^2) product -- confirm in fp_x64_asm.S).
+    fp2mul503_c1_asm(a, b, c);
+}
+
+
+void fp2sqr503_c0_mont(const digit_t* a, digit_t* c)
+{ // Forwards a, c unchanged to fp2sqr503_c0_asm (presumably the non-complex part of a GF(p503^2) squaring -- confirm in fp_x64_asm.S).
+    fp2sqr503_c0_asm(a, c);
+}
+
+
+void fp2sqr503_c1_mont(const digit_t* a, digit_t* c)
+{ // Forwards a, c unchanged to fp2sqr503_c1_asm (presumably the complex part of a GF(p503^2) squaring -- confirm in fp_x64_asm.S).
+    fp2sqr503_c1_asm(a, c);
+}
+
+
+void fpmul503(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Forwards a, b, c unchanged to fpmul503_asm (presumably the GF(p503) field multiplication -- confirm in fp_x64_asm.S).
+    fpmul503_asm(a, b, c);
+}
+
+#else
 
 void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
 { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
         
     UNREFERENCED_PARAMETER(nwords);
-
-#if (OS_TARGET == OS_WIN)
     digit_t t = 0;
     uint128_t uv = {0};
     unsigned int carry = 0;
@@ -370,12 +399,6 @@ void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int n
     MULADD128(a[7], b[7], uv, carry, uv);
     c[14] = uv[0];
     c[15] = uv[1];
-
-#elif (OS_TARGET == OS_NIX)
-    
-    mul503_asm(a, b, c);
-
-#endif
 }
 
 
@@ -384,8 +407,6 @@ void rdc_mont(digit_t* ma, digit_t* mc)
   // mc = ma*R^-1 mod p503x2, where R = 2^512.
   // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
   // ma is assumed to be in Montgomery representation.
-        
-#if (OS_TARGET == OS_WIN)
     unsigned int carry;
     digit_t t = 0;
     uint128_t uv = {0};
@@ -559,11 +580,7 @@ void rdc_mont(digit_t* ma, digit_t* mc)
     t += carry;
     ADDC(0, uv[0], ma[14], carry, mc[6]); 
     ADDC(carry, uv[1], 0, carry, uv[1]); 
-    ADDC(0, uv[1], ma[15], carry, mc[7]); 
-    
-#elif (OS_TARGET == OS_NIX)                 
-    
-    rdc503_asm(ma, mc);    
+    ADDC(0, uv[1], ma[15], carry, mc[7]);
+}
 
-#endif
-}
\ No newline at end of file
+#endif
\ No newline at end of file
diff --git a/src/P503/AMD64/fp_x64_asm.S b/src/P503/AMD64/fp_x64_asm.S
index 8ebce4f..2843464 100644
--- a/src/P503/AMD64/fp_x64_asm.S
+++ b/src/P503/AMD64/fp_x64_asm.S
@@ -1,1820 +1,1687 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license 
 //
 // Abstract: field arithmetic in x64 assembly for P503 on Linux
-//*******************************************************************************************  
-
-.intel_syntax noprefix
-
-// Format function and variable names for Mac OS X
-#if defined(__APPLE__)
-    #define fmt(f)    _##f
-#else
-    #define fmt(f)    f
-#endif
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-
-// Define addition instructions
-#ifdef _MULX_
-#ifdef _ADX_
-
-#define ADD1    adox
-#define ADC1    adox
-#define ADD2    adcx
-#define ADC2    adcx
-
-#else
-
-#define ADD1    add
-#define ADC1    adc
-#define ADD2    add
-#define ADC2    adc
-
-#endif   
-#endif
-
-
-.text
-//***********************************************************************
-//  Field addition
-//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//*********************************************************************** 
-.global fmt(fpadd503_asm)
-fmt(fpadd503_asm):
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  mov    r15, [reg_p1+56] 
-  add    r8, [reg_p2] 
-  adc    r9, [reg_p2+8] 
-  adc    r10, [reg_p2+16] 
-  adc    r11, [reg_p2+24] 
-  adc    r12, [reg_p2+32] 
-  adc    r13, [reg_p2+40] 
-  adc    r14, [reg_p2+48] 
-  adc    r15, [reg_p2+56]
-
-  mov    rcx, [rip+fmt(p503x2)]
-  sub    r8, rcx
-  mov    rcx, [rip+fmt(p503x2)+8]
-  sbb    r9, rcx
-  sbb    r10, rcx
-  mov    rcx, [rip+fmt(p503x2)+24]
-  sbb    r11, rcx
-  mov    rcx, [rip+fmt(p503x2)+32]
-  sbb    r12, rcx
-  mov    rcx, [rip+fmt(p503x2)+40]
-  sbb    r13, rcx
-  mov    rcx, [rip+fmt(p503x2)+48]
-  sbb    r14, rcx
-  mov    rcx, [rip+fmt(p503x2)+56]
-  sbb    r15, rcx
-  sbb    rax, 0
-  
-  mov    rdi, [rip+fmt(p503x2)]
-  and    rdi, rax
-  mov    rsi, [rip+fmt(p503x2)+8]
-  and    rsi, rax
-  mov    rcx, [rip+fmt(p503x2)+24]
-  and    rcx, rax
-  
-  add    r8, rdi  
-  adc    r9, rsi  
-  adc    r10, rsi 
-  adc    r11, rcx 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11 
-  setc   cl
-
-  mov    r8, [rip+fmt(p503x2)+32]
-  and    r8, rax
-  mov    r9, [rip+fmt(p503x2)+40]
-  and    r9, rax
-  mov    r10, [rip+fmt(p503x2)+48]
-  and    r10, rax
-  mov    r11, [rip+fmt(p503x2)+56]
-  and    r11, rax
-  
-  bt     rcx, 0
-  adc    r12, r8   
-  adc    r13, r9  
-  adc    r14, r10  
-  adc    r15, r11  
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13 
-  mov    [reg_p3+48], r14 
-  mov    [reg_p3+56], r15 
-  
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
-//***********************************************************************
-//  Field subtraction
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
-//*********************************************************************** 
-.global fmt(fpsub503_asm)
-fmt(fpsub503_asm):
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  mov    r15, [reg_p1+56]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    r14, [reg_p2+48] 
-  sbb    r15, [reg_p2+56]
-  sbb    rax, 0
-  
-  mov    rdi, [rip+fmt(p503x2)]
-  and    rdi, rax
-  mov    rsi, [rip+fmt(p503x2)+8]
-  and    rsi, rax
-  mov    rcx, [rip+fmt(p503x2)+24]
-  and    rcx, rax
-  
-  add    r8, rdi  
-  adc    r9, rsi  
-  adc    r10, rsi 
-  adc    r11, rcx 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11 
-  setc   cl
-
-  mov    r8, [rip+fmt(p503x2)+32]
-  and    r8, rax
-  mov    r9, [rip+fmt(p503x2)+40]
-  and    r9, rax
-  mov    r10, [rip+fmt(p503x2)+48]
-  and    r10, rax
-  mov    r11, [rip+fmt(p503x2)+56]
-  and    r11, rax
-  
-  bt     rcx, 0
-  adc    r12, r8   
-  adc    r13, r9  
-  adc    r14, r10  
-  adc    r15, r11  
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13 
-  mov    [reg_p3+48], r14 
-  mov    [reg_p3+56], r15 
-  
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
-///////////////////////////////////////////////////////////////// MACRO
-.macro SUB503_PX  P0
-  push   r12
-  push   r13
-  push   r14
-  
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  mov    rcx, [reg_p1+56]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    r14, [reg_p2+48] 
-  sbb    rcx, [reg_p2+56]
-
-  mov    rax, [rip+\P0]
-  mov    rdi, [rip+\P0+8]
-  mov    rsi, [rip+\P0+24]
-  add    r8, rax
-  mov    rax, [rip+\P0+32]  
-  adc    r9, rdi  
-  adc    r10, rdi 
-  adc    r11, rsi 
-  adc    r12, rax
-  mov    rdi, [rip+\P0+40]
-  mov    rsi, [rip+\P0+48]
-  mov    rax, [rip+\P0+56]
-  adc    r13, rdi  
-  adc    r14, rsi
-  adc    rcx, rax   
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], r14
-  mov    [reg_p3+56], rcx
-  
-  pop    r14
-  pop    r13
-  pop    r12
-  .endm
-
-
-//***********************************************************************
-//  Multiprecision subtraction with correction with 2*p503
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p503
-//*********************************************************************** 
-.global fmt(mp_sub503_p2_asm)
-fmt(mp_sub503_p2_asm):
-
-  SUB503_PX  fmt(p503x2)
-  ret
+//******************************************************************************************* 
+
+.intel_syntax noprefix
+
+// Format function and variable names for Mac OS X
+#if defined(__APPLE__)
+    #define fmt(f)    _##f
+#else
+    #define fmt(f)    f
+#endif
+
+// Registers that are used for parameter passing:
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+
+
+.text
+//***********************************************************************
+//  503-bit multiprecision addition
+//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
+//*********************************************************************** 
+.global fmt(mp_add503_asm)
+fmt(mp_add503_asm):               // plain 8-word addition, no modular correction; uses only r8-r11, so nothing is pushed
+  mov    r8, [reg_p1]             // load a[0..3]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  add    r8, [reg_p2]             // a[0] + b[0] starts the carry chain
+  adc    r9, [reg_p2+8] 
+  adc    r10, [reg_p2+16] 
+  adc    r11, [reg_p2+24] 
+  mov    [reg_p3], r8             // store c[0..3]; mov does not modify CF, so the carry survives
+  mov    [reg_p3+8], r9
+  mov    [reg_p3+16], r10
+  mov    [reg_p3+24], r11
+  
+  mov    r8, [reg_p1+32]          // load a[4..7]
+  mov    r9, [reg_p1+40]
+  mov    r10, [reg_p1+48]
+  mov    r11, [reg_p1+56]
+  adc    r8, [reg_p2+32]          // continues the carry chain from word 3
+  adc    r9, [reg_p2+40] 
+  adc    r10, [reg_p2+48] 
+  adc    r11, [reg_p2+56]
+  mov    [reg_p3+32], r8          // store c[4..7]
+  mov    [reg_p3+40], r9
+  mov    [reg_p3+48], r10
+  mov    [reg_p3+56], r11
+  ret                             // the carry out of word 7 is not written to any register or to memory
+
+
+//***********************************************************************
+//  Field addition
+//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
+//*********************************************************************** 
+.global fmt(fpadd503_asm)
+fmt(fpadd503_asm):                // branch-free: c = a + b - p503x2, then p503x2 is added back under a mask if that subtraction borrowed
+  push   r12                      // r12-r15 are used as the upper accumulator words and restored before ret
+  push   r13
+  push   r14
+  push   r15
+  
+  xor    rax, rax                 // rax = 0; becomes the borrow mask below
+  mov    r8, [reg_p1]             // [r8:r15] <- a
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  mov    r15, [reg_p1+56] 
+  add    r8, [reg_p2]             // [r8:r15] <- a + b
+  adc    r9, [reg_p2+8] 
+  adc    r10, [reg_p2+16] 
+  adc    r11, [reg_p2+24] 
+  adc    r12, [reg_p2+32] 
+  adc    r13, [reg_p2+40] 
+  adc    r14, [reg_p2+48] 
+  adc    r15, [reg_p2+56]
+
+  mov    rcx, [rip+fmt(p503x2)]   // [r8:r15] <- a + b - p503x2 (mov leaves the borrow flag intact between sbb steps)
+  sub    r8, rcx
+  mov    rcx, [rip+fmt(p503x2)+8]
+  sbb    r9, rcx
+  sbb    r10, rcx                 // word 2 reuses the word loaded from +8 (nothing is loaded from +16); presumably words 1 and 2 of p503x2 are equal - confirm
+  mov    rcx, [rip+fmt(p503x2)+24]
+  sbb    r11, rcx
+  mov    rcx, [rip+fmt(p503x2)+32]
+  sbb    r12, rcx
+  mov    rcx, [rip+fmt(p503x2)+40]
+  sbb    r13, rcx
+  mov    rcx, [rip+fmt(p503x2)+48]
+  sbb    r14, rcx
+  mov    rcx, [rip+fmt(p503x2)+56]
+  sbb    r15, rcx
+  sbb    rax, 0                   // rax = all ones if the subtraction borrowed, else 0
+  
+  mov    rdi, [rip+fmt(p503x2)]   // masked copy of p503x2 words 0, 1 (also used for 2) and 3; reg_p1/reg_p2 are no longer needed
+  and    rdi, rax
+  mov    rsi, [rip+fmt(p503x2)+8]
+  and    rsi, rax
+  mov    rcx, [rip+fmt(p503x2)+24]
+  and    rcx, rax
+  
+  add    r8, rdi                  // add back the masked low half
+  adc    r9, rsi  
+  adc    r10, rsi 
+  adc    r11, rcx 
+  mov    [reg_p3], r8
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11 
+  setc   cl                       // park the carry in bit 0 of rcx: the and instructions below overwrite CF
+
+  mov    r8, [rip+fmt(p503x2)+32] // masked copy of p503x2 words 4..7
+  and    r8, rax
+  mov    r9, [rip+fmt(p503x2)+40]
+  and    r9, rax
+  mov    r10, [rip+fmt(p503x2)+48]
+  and    r10, rax
+  mov    r11, [rip+fmt(p503x2)+56]
+  and    r11, rax
+  
+  bt     rcx, 0                   // reload the parked carry into CF
+  adc    r12, r8                  // add back the masked high half
+  adc    r13, r9  
+  adc    r14, r10  
+  adc    r15, r11  
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13 
+  mov    [reg_p3+48], r14 
+  mov    [reg_p3+56], r15 
+  
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+
+//***********************************************************************
+//  Field subtraction
+//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
+//*********************************************************************** 
+.global fmt(fpsub503_asm)
+fmt(fpsub503_asm):                // branch-free: c = a - b, then p503x2 is added back under a mask if the subtraction borrowed
+  push   r12                      // r12-r15 are used as the upper accumulator words and restored before ret
+  push   r13
+  push   r14
+  push   r15
+  
+  xor    rax, rax                 // rax = 0; becomes the borrow mask below
+  mov    r8, [reg_p1]             // [r8:r15] <- a
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  mov    r15, [reg_p1+56]
+  sub    r8, [reg_p2]             // [r8:r15] <- a - b
+  sbb    r9, [reg_p2+8] 
+  sbb    r10, [reg_p2+16] 
+  sbb    r11, [reg_p2+24] 
+  sbb    r12, [reg_p2+32] 
+  sbb    r13, [reg_p2+40] 
+  sbb    r14, [reg_p2+48] 
+  sbb    r15, [reg_p2+56]
+  sbb    rax, 0                   // rax = all ones if a - b borrowed, else 0
+  
+  mov    rdi, [rip+fmt(p503x2)]   // masked copy of p503x2 words 0, 1 (also used for 2) and 3; reg_p1/reg_p2 are no longer needed
+  and    rdi, rax
+  mov    rsi, [rip+fmt(p503x2)+8]
+  and    rsi, rax
+  mov    rcx, [rip+fmt(p503x2)+24]
+  and    rcx, rax
+  
+  add    r8, rdi                  // add back the masked low half
+  adc    r9, rsi  
+  adc    r10, rsi                 // word 2 reuses the word loaded from +8; presumably words 1 and 2 of p503x2 are equal - confirm
+  adc    r11, rcx 
+  mov    [reg_p3], r8
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11 
+  setc   cl                       // park the carry in bit 0 of rcx: the and instructions below overwrite CF
+
+  mov    r8, [rip+fmt(p503x2)+32] // masked copy of p503x2 words 4..7
+  and    r8, rax
+  mov    r9, [rip+fmt(p503x2)+40]
+  and    r9, rax
+  mov    r10, [rip+fmt(p503x2)+48]
+  and    r10, rax
+  mov    r11, [rip+fmt(p503x2)+56]
+  and    r11, rax
+  
+  bt     rcx, 0                   // reload the parked carry into CF
+  adc    r12, r8                  // add back the masked high half
+  adc    r13, r9  
+  adc    r14, r10  
+  adc    r15, r11  
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13 
+  mov    [reg_p3+48], r14 
+  mov    [reg_p3+56], r15 
+  
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+
+///////////////////////////////////////////////////////////////// MACRO: c [reg_p3] = a [reg_p1] - b [reg_p2] + P0, unconditional (no mask); the invoking routine supplies the ret
+.macro SUB503_PX  P0
+  push   r12                      // r12-r14 are saved and restored inside the macro itself
+  push   r13
+  push   r14
+  
+  mov    r8, [reg_p1]             // [r8:r14, rcx] <- a
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  mov    rcx, [reg_p1+56]
+  sub    r8, [reg_p2]             // [r8:r14, rcx] <- a - b; the final borrow is not inspected
+  sbb    r9, [reg_p2+8] 
+  sbb    r10, [reg_p2+16] 
+  sbb    r11, [reg_p2+24] 
+  sbb    r12, [reg_p2+32] 
+  sbb    r13, [reg_p2+40] 
+  sbb    r14, [reg_p2+48] 
+  sbb    rcx, [reg_p2+56]
+
+  mov    rax, [rip+\P0]           // words of P0 are staged in rax/rdi/rsi; reg_p1 and reg_p2 are overwritten here (inputs already consumed)
+  mov    rdi, [rip+\P0+8]
+  mov    rsi, [rip+\P0+24]
+  add    r8, rax
+  mov    rax, [rip+\P0+32]        // mov between the adc steps does not disturb the carry
+  adc    r9, rdi  
+  adc    r10, rdi                 // word 2 reuses the word loaded from +8 (nothing is loaded from +16); presumably words 1 and 2 of P0 are equal - confirm
+  adc    r11, rsi 
+  adc    r12, rax
+  mov    rdi, [rip+\P0+40]
+  mov    rsi, [rip+\P0+48]
+  mov    rax, [rip+\P0+56]
+  adc    r13, rdi  
+  adc    r14, rsi
+  adc    rcx, rax   
+  mov    [reg_p3], r8             // store the 8 result words
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13
+  mov    [reg_p3+48], r14
+  mov    [reg_p3+56], rcx
+  
+  pop    r14
+  pop    r13
+  pop    r12
+.endm
+
+
+//***********************************************************************
+//  Multiprecision subtraction with correction with 2*p503
+//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p503
+//*********************************************************************** 
+.global fmt(mp_sub503_p2_asm)
+fmt(mp_sub503_p2_asm):            // whole body is the SUB503_PX macro instantiated with the constant p503x2
+
+  SUB503_PX  fmt(p503x2)
+  ret
+
+
+#ifdef _MULX_
+#ifdef _ADX_
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x bi + z  (one 64-bit word bi times the 8-word operand at M1, accumulated into z)
+// Inputs: base memory pointer M1 (a), written as a bracketed operand so that 8, 16, ... can be prefixed as displacements,
+//         bi pre-stored in rdx,
+//         accumulator z in [Z0:Z8]
+// Output: [Z0:Z8]
+// Temps:  regs T0:T1; C is a register that gets zeroed (the xor also clears CF and OF before the two carry chains start)
+/////////////////////////////////////////////////////////////////
+.macro MULADD64x512 M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, T0, T1, C
+	xor    \C, \C
+    mulx   \T0, \T1, \M1     // a[0]*bi: T0 = high word, T1 = low word (mulx does not touch flags)
+    adox   \Z0, \T1          // OF chain
+    adox   \Z1, \T0 
+    mulx   \T0, \T1, 8\M1    // a[1]*bi
+    adcx   \Z1, \T1          // low words from here on go through the CF chain (adcx)
+    adox   \Z2, \T0          // high words go through the OF chain (adox)
+    mulx   \T0, \T1, 16\M1   // a[2]*bi
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // a[3]*bi
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    mulx   \T0, \T1, 32\M1   // a[4]*bi
+    adcx   \Z4, \T1
+    adox   \Z5, \T0
+    mulx   \T0, \T1, 40\M1   // a[5]*bi
+    adcx   \Z5, \T1
+    adox   \Z6, \T0
+    mulx   \T0, \T1, 48\M1   // a[6]*bi
+    adcx   \Z6, \T1
+    adox   \Z7, \T0
+    mulx   \T0, \T1, 56\M1   // a[7]*bi
+    adcx   \Z7, \T1
+    adox   \Z8, \T0
+    adc    \Z8, 0            // fold the final CF-chain carry into the top word
+.endm
+
+
+.macro MULADD64x320 M1, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1
+	xor    \T0, \T0
+    mulx   \T0, \T1, \M1     // rdx * M1[0]: T0 = high word, T1 = low word; the xor above was only needed to clear CF and OF
+    adox   \Z0, \T1          // OF chain
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // rdx * M1[1]
+    adcx   \Z1, \T1          // low words from here on go through the CF chain (adcx)
+    adox   \Z2, \T0          // high words go through the OF chain (adox)
+    mulx   \T0, \T1, 16\M1   // rdx * M1[2]
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // rdx * M1[3]
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    mulx   \T0, \T1, 32\M1   // rdx * M1[4]
+    adcx   \Z4, \T1
+    adox   \Z5, \T0
+    adc    \Z5, 0            // fold the final CF-chain carry into the top word
+.endm
+
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x b + c x d (mod p), with a reduction step (z0 x p503p1 + z)/2^64 interleaved after every pair of word products
+// Inputs: M00, M01 = base memory operands walked 8 words at a time by MULADD64x512; M10, M11 = base memory operands consumed one word per round through rdx,
+//         rcx must hold the output pointer (result words 0 and 1 are stored to [rcx] and [rcx+8] inside the macro),
+//         accumulator z in [Z0:Z8], pre-loaded by the caller with M00 x (word 0 of M10)
+// Output: result words 0,1 at [rcx], [rcx+8]; result words 2..7 are left in Z1:Z6 for the caller to store
+// Temps:  regs T0:T1, rdx
+/////////////////////////////////////////////////////////////////
+.macro FPDBLMUL512x512 M00, M01, M10, M11, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, T0, T1  
+    mov    rdx, \M11         // rdx <- word 0 of the M11 operand
+    MULADD64x512 \M01, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \T0, \T1, \T0        
+    // [Z1:Z8, Z0] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z0          // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \T0, \T1    // only words 3..7 of p503p1 are multiplied; presumably words 0..2 are zero - confirm
+    
+    // [Z1:Z8, Z0] <- z = a0 x b01 - a1 x b11 + z 
+    mov    rdx, 8\M10
+    MULADD64x512 \M00, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1, \Z0    // C = Z0: the retired low word is zeroed and reused as the new top word
+    mov    rdx, 8\M11    
+    MULADD64x512 \M01, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1, \T0    // C = T0: only the flags need clearing, the top word is kept
+    // [Z2:Z8, Z0] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z1          // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1 
+    
+    // [Z2:Z8, Z0:Z1] <- z = a0 x b02 - a1 x b12 + z 
+    mov    rdx, 16\M10
+    MULADD64x512 \M00, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1, \Z1
+    mov    rdx, 16\M11    
+    MULADD64x512 \M01, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1, \T0
+    // [Z3:Z8, Z0:Z1] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z2         // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1
+    
+    // [Z3:Z8, Z0:Z2] <- z = a0 x b03 - a1 x b13 + z  
+    mov    rdx, 24\M10
+    MULADD64x512 \M00, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1, \Z2
+    mov    rdx, 24\M11    
+    MULADD64x512 \M01, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1, \T0
+    // [Z4:Z8, Z0:Z2] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z3         // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1
+    
+    // [Z4:Z8, Z0:Z3] <- z = a0 x b04 - a1 x b14 + z 
+    mov    rdx, 32\M10
+    MULADD64x512 \M00, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3
+    mov    rdx, 32\M11    
+    MULADD64x512 \M01, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T0
+    // [Z5:Z8, Z0:Z3] <- z = (z0 x p503p1 + z)/2^64   
+    mov    rdx, \Z4         // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+    
+    // [Z5:Z8, Z0:Z4] <- z = a0 x b05 - a1 x b15 + z
+    mov    rdx, 40\M10
+    MULADD64x512 \M00, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4
+    mov    rdx, 40\M11    
+    MULADD64x512 \M01, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T0
+    // [Z6:Z8, Z0:Z4] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z5         // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1 
+    
+    // [Z6:Z8, Z0:Z5] <- z = a0 x b06 - a1 x b16 + z
+    mov    rdx, 48\M10
+    MULADD64x512 \M00, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5
+    mov    rdx, 48\M11    
+    MULADD64x512 \M01, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T0
+    // [Z7, Z0:Z5] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z6         // rdx <- z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1
+    
+    // [Z7, Z0:Z6] <- z = a0 x b07 - a1 x b17 + z  
+    mov    rdx, 56\M10
+    MULADD64x512 \M00, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \Z6
+    mov    rdx, 56\M11    
+    MULADD64x512 \M01, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T0
+    // [Z8, Z0:Z6] <- z = (z0 x p503p1 + z)/2^64  
+    mov    rdx, \Z7         // rdx <- z0 
+    mov    [rcx], \Z8       // result words 0 and 1 can be stored now: the last reduction step below only updates Z1:Z6
+    mov    [rcx+8], \Z0 
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1 
+.endm
+
+
+//***********************************************************************
+//  Multiplication in GF(p^2), non-complex part
+//  Operation: c [reg_p3] = a0 x b0 - a1 x b1
+//  Inputs: a = [a1, a0] stored in [reg_p1] 
+//          b = [b1, b0] stored in [reg_p2] 
+//  Output: c stored in [reg_p3]; NOTE(review): c is also used as scratch for 8*p - b1 before a and b are fully read, so c presumably must not overlap a or b - confirm with callers
+//***********************************************************************
+.global fmt(fp2mul503_c0_asm)
+fmt(fp2mul503_c0_asm):     
+    push   r12               // callee-saved registers are pushed one by one between arithmetic instructions (push does not modify flags)
+    mov    rcx, reg_p3       // rcx <- output pointer; rdx (reg_p3) is needed as the implicit mulx operand
+	
+	// [rcx0:7] <- 8*p - b1
+	mov    r8, [rip+fmt(p503x8)]  
+	mov    r9, [rip+fmt(p503x8)+8]   
+	mov    r11, [rip+fmt(p503x8)+24]
+	mov    r12, [rip+fmt(p503x8)+32] 
+	mov    rax, [reg_p2+64]        // b1 starts at byte offset 64 of b
+	mov    rdx, [reg_p2+72]
+	mov    r10, r9                 // word 2 of p503x8 is copied from word 1 (nothing is loaded from +16)
+	sub    r8, rax
+    push   r13 
+	sbb    r9, rdx
+	mov    rax, [reg_p2+80]
+	mov    rdx, [reg_p2+88]
+	sbb    r10, rax
+    push   r14 
+	sbb    r11, rdx
+	mov    rax, [reg_p2+96]
+	mov    rdx, [reg_p2+104]
+	mov    [rcx], r8
+	mov    [rcx+8], r9
+	mov    r13, [rip+fmt(p503x8)+40]
+	mov    r14, [rip+fmt(p503x8)+48]
+	sbb    r12, rax
+    push   r15 
+	sbb    r13, rdx
+	mov    rax, [reg_p2+112]
+	mov    rdx, [reg_p2+120]
+	mov    r15, [rip+fmt(p503x8)+56]
+	sbb    r14, rax 
+	sbb    r15, rdx 
+	mov    [rcx+16], r10
+    
+    // [r8:r15, rax] <- z = a0 x b00 (first partial product; the macro below adds the remaining terms)
+    mov    rdx, [reg_p2]           // rdx <- word 0 of b0
+    mulx   r9, r8, [reg_p1] 
+	mov    [rcx+24], r11           // remaining words of 8*p - b1 are stored as their registers get recycled
+    xor    rax, rax                // clears CF for the adcx chain
+    mulx   r10, r11, [reg_p1+8]
+	mov    [rcx+32], r12     
+    adcx   r9, r11        
+    mulx   r11, r12, [reg_p1+16]
+	mov    [rcx+40], r13     
+    adcx   r10, r12        
+    mulx   r12, r13, [reg_p1+24] 
+	mov    [rcx+48], r14     
+    adcx   r11, r13       
+    mulx   r13, r14, [reg_p1+32] 
+	mov    [rcx+56], r15      
+    adcx   r12, r14      
+    mulx   r14, rax, [reg_p1+40] 
+    push   rbx
+    adcx   r13, rax      
+    mulx   r15, rax, [reg_p1+48]
+    push   rbp 
+    adcx   r14, rax     
+    mulx   rax, rbx, [reg_p1+56] 
+    mov    rdx, [rcx]              // rdx <- word 0 of 8*p - b1; the macro starts by loading the same value again
+    adcx   r15, rbx     
+    adc    rax, 0                  // rax = top (ninth) accumulator word
+
+	FPDBLMUL512x512 [reg_p1], [reg_p1+64], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rax, rbx, rbp 
+                  
+    mov    [rcx+16], r9            // result words 2..7; words 0 and 1 were stored inside the macro
+    mov    [rcx+24], r10      
+    mov    [rcx+32], r11      
+    mov    [rcx+40], r12      
+    mov    [rcx+48], r13    
+    mov    [rcx+56], r14
+    pop    rbp                     // restore in reverse push order
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Multiplication in GF(p^2), complex part
+//  Operation: c [reg_p3] = a0 x b1 + a1 x b0
+//  Inputs: a = [a1, a0] stored in [reg_p1] 
+//          b = [b1, b0] stored in [reg_p2] 
+//  Output: c stored in [reg_p3]
+//***********************************************************************
+.global fmt(fp2mul503_c1_asm)
+fmt(fp2mul503_c1_asm):
+    mov    rcx, reg_p3             // rcx <- output pointer; rdx (reg_p3) is needed as the implicit mulx operand
+    
+    // [r8:r15, rax] <- z = a1 x b00 (first partial product; the macro below adds the remaining terms)
+    mov    rdx, [reg_p2]           // rdx <- word 0 of b0
+    mulx   r9, r8, [reg_p1+64]     // a1 (byte offset 64 of a) x b00
+    xor    rax, rax                // clears CF for the adcx chain
+    push   r12                     // callee-saved registers are pushed between arithmetic instructions (push does not modify flags)
+    mulx   r10, r11, [reg_p1+72]  
+    push   r13  
+    adcx   r9, r11        
+    mulx   r11, r12, [reg_p1+80]  
+    push   r14   
+    adcx   r10, r12        
+    mulx   r12, r13, [reg_p1+88] 
+    push   r15    
+    adcx   r11, r13       
+    mulx   r13, r14, [reg_p1+96]   
+    push   rbx  
+    adcx   r12, r14      
+    mulx   r14, r15, [reg_p1+104] 
+    push   rbp   
+    adcx   r13, r15      
+    mulx   r15, rbp, [reg_p1+112]
+    adcx   r14, rbp     
+    mulx   rax, rbx, [reg_p1+120] 
+    adcx   r15, rbx     
+    adc    rax, 0                  // rax = top (ninth) accumulator word
+
+	FPDBLMUL512x512 [reg_p1+64], [reg_p1], [reg_p2], [reg_p2+64], r8, r9, r10, r11, r12, r13, r14, r15, rax, rbx, rbp 
+           
+    mov    [rcx+16], r9            // result words 2..7; words 0 and 1 were stored inside the macro
+    mov    [rcx+24], r10   
+    mov    [rcx+32], r11      
+    mov    [rcx+40], r12                  
+    mov    [rcx+48], r13    
+    mov    [rcx+56], r14
+    pop    rbp                     // restore in reverse push order
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+#else
+
+# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE"
+
+#endif
+
+#else
+
+//***********************************************************************
+//  Integer multiplication
+//  Based on Karatsuba method (one level: 8-word operands split into 4-word halves AL/AH and BL/BH)
+//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]  (8 x 8 words -> 16 words)
+//  NOTE: a=c or b=c are not allowed (c is written as scratch before a and b are fully read)
+//***********************************************************************
+.global fmt(mul503_asm)
+fmt(mul503_asm):
+  push   r12
+  push   r13
+  push   r14
+  mov    rcx, reg_p3       // rcx <- output pointer; rdx (reg_p3) is clobbered by every mul
+  
+  // rcx[0-3] <- AH+AL
+  xor    rax, rax
+  mov    r8, [reg_p1+32]
+  mov    r9, [reg_p1+40]
+  mov    r10, [reg_p1+48]
+  mov    r11, [reg_p1+56]
+  add    r8, [reg_p1] 
+  adc    r9, [reg_p1+8] 
+  adc    r10, [reg_p1+16] 
+  adc    r11, [reg_p1+24] 
+  push   r15               // push and the mov stores below leave CF intact for the sbb
+  mov    [rcx], r8
+  mov    [rcx+8], r9
+  mov    [rcx+16], r10
+  mov    [rcx+24], r11
+  sbb    rax, 0            // rax = all ones if AH+AL carried out of 256 bits, else 0
+  sub    rsp, 80           // Allocating space in stack
+       
+  // r12-r15 <- BH+BL
+  xor    rdx, rdx
+  mov    r12, [reg_p2+32]
+  mov    r13, [reg_p2+40]
+  mov    r14, [reg_p2+48]
+  mov    r15, [reg_p2+56]
+  add    r12, [reg_p2] 
+  adc    r13, [reg_p2+8] 
+  adc    r14, [reg_p2+16] 
+  adc    r15, [reg_p2+24] 
+  sbb    rdx, 0            // rdx = all ones if BH+BL carried out of 256 bits, else 0
+  mov    [rsp+64], rax     // carry mask of AH+AL
+  mov    [rsp+72], rdx     // carry mask of BH+BL
+  
+  // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL)
+  mov    rax, [rcx]
+  mul    r12
+  mov    [rsp], rax        // c0
+  mov    r8, rdx
+  
+  xor    r9, r9
+  mov    rax, [rcx]
+  mul    r13
+  add    r8, rax
+  adc    r9, rdx
+  
+  xor    r10, r10
+  mov    rax, [rcx+8] 
+  mul    r12
+  add    r8, rax
+  mov    [rsp+8], r8       // c1 
+  adc    r9, rdx
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [rcx] 
+  mul    r14
+  add    r9, rax 
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  mov    rax, [rcx+16] 
+  mul    r12
+  add    r9, rax
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  mov    rax, [rcx+8] 
+  mul    r13
+  add    r9, rax
+  mov    [rsp+16], r9      // c2 
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  xor    r9, r9
+  mov    rax, [rcx] 
+  mul    r15
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [rcx+24] 
+  mul    r12
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [rcx+8] 
+  mul    r14
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [rcx+16] 
+  mul    r13
+  add    r10, rax
+  mov    [rsp+24], r10     // c3 
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  xor    r10, r10
+  mov    rax, [rcx+8] 
+  mul    r15
+  add    r8, rax
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  mov    rax, [rcx+24] 
+  mul    r13
+  add    r8, rax
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  mov    rax, [rcx+16] 
+  mul    r14
+  add    r8, rax
+  mov    [rsp+32], r8      // c4 
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  xor    r11, r11
+  mov    rax, [rcx+16]
+  mul    r15
+  add    r9, rax
+  adc    r10, rdx
+  adc    r11, 0
+
+  mov    rax, [rcx+24] 
+  mul    r14
+  add    r9, rax          // c5 
+  adc    r10, rdx
+  adc    r11, 0
+
+  mov    rax, [rcx+24] 
+  mul    r15
+  add    r10, rax         // c6 
+  adc    r11, rdx         // c7 
+  
+  mov    rax, [rsp+64]    // carry correction: if AH+AL carried, add (BH+BL) to the upper half c4..c7
+  and    r12, rax
+  and    r13, rax
+  and    r14, rax
+  and    r15, rax
+  add    r12, r8
+  adc    r13, r9
+  adc    r14, r10
+  adc    r15, r11
+
+  mov    rax, [rsp+72]    // carry correction: if BH+BL carried, add (AH+AL), still held in rcx[0-3], to the upper half
+  mov    r8, [rcx]
+  mov    r9, [rcx+8]
+  mov    r10, [rcx+16]
+  mov    r11, [rcx+24]
+  and    r8, rax
+  and    r9, rax
+  and    r10, rax
+  and    r11, rax
+  add    r8, r12
+  adc    r9, r13
+  adc    r10, r14
+  adc    r11, r15         // carries out of bit 512 are dropped here. NOTE(review): presumably harmless because the final middle term fits in 512 bits - confirm
+  mov    [rsp+32], r8     // rsp[4-7] <- corrected upper half of (AH+AL)*(BH+BL)
+  mov    [rsp+40], r9
+  mov    [rsp+48], r10
+  mov    [rsp+56], r11
+  
+  // rcx[0-7] <- AL*BL
+  mov    r11, [reg_p1]
+  mov    rax, [reg_p2] 
+  mul    r11
+  xor    r9, r9
+  mov    [rcx], rax        // c0
+  mov    r8, rdx
+  
+  mov    r14, [reg_p1+16] 
+  mov    rax, [reg_p2+8]
+  mul    r11
+  xor    r10, r10
+  add    r8, rax
+  adc    r9, rdx
+
+  mov    r12, [reg_p1+8] 
+  mov    rax, [reg_p2] 
+  mul    r12
+  add    r8, rax
+  mov    [rcx+8], r8       // c1 
+  adc    r9, rdx
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [reg_p2+16] 
+  mul    r11
+  add    r9, rax
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  mov    r13, [reg_p2] 
+  mov    rax, r14 
+  mul    r13
+  add    r9, rax
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  mov    rax, [reg_p2+8] 
+  mul    r12
+  add    r9, rax
+  mov    [rcx+16], r9      // c2 
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  xor    r9, r9
+  mov    rax, [reg_p2+24] 
+  mul    r11
+  mov    r15, [reg_p1+24] 
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, r15 
+  mul    r13
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [reg_p2+16] 
+  mul    r12
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [reg_p2+8] 
+  mul    r14
+  add    r10, rax
+  mov    [rcx+24], r10     // c3 
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  xor    r10, r10
+  mov    rax, [reg_p2+24] 
+  mul    r12
+  add    r8, rax
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  mov    rax, [reg_p2+8] 
+  mul    r15
+  add    r8, rax
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  mov    rax, [reg_p2+16] 
+  mul    r14
+  add    r8, rax
+  mov    [rcx+32], r8     // c4 
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [reg_p2+24]
+  mul    r14
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+
+  mov    rax, [reg_p2+16] 
+  mul    r15
+  add    r9, rax
+  mov    [rcx+40], r9      // c5 
+  adc    r10, rdx
+  adc    r8, 0
+
+  mov    rax, [reg_p2+24] 
+  mul    r15
+  add    r10, rax
+  mov    [rcx+48], r10     // c6 
+  adc    r8, rdx   
+  mov    [rcx+56], r8      // c7 
+
+  // rcx[8-15] <- AH*BH
+  mov    r11, [reg_p1+32]
+  mov    rax, [reg_p2+32] 
+  mul    r11
+  xor    r9, r9
+  mov    [rcx+64], rax     // c0
+  mov    r8, rdx
+  
+  mov    r14, [reg_p1+48] 
+  mov    rax, [reg_p2+40]
+  mul    r11
+  xor    r10, r10
+  add    r8, rax
+  adc    r9, rdx
+
+  mov    r12, [reg_p1+40] 
+  mov    rax, [reg_p2+32] 
+  mul    r12
+  add    r8, rax
+  mov    [rcx+72], r8      // c1 
+  adc    r9, rdx
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [reg_p2+48] 
+  mul    r11
+  add    r9, rax
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  mov    r13, [reg_p2+32] 
+  mov    rax, r14 
+  mul    r13
+  add    r9, rax
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  mov    rax, [reg_p2+40] 
+  mul    r12
+  add    r9, rax
+  mov    [rcx+80], r9      // c2 
+  adc    r10, rdx 
+  adc    r8, 0
+  
+  xor    r9, r9
+  mov    rax, [reg_p2+56] 
+  mul    r11
+  mov    r15, [reg_p1+56] 
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, r15 
+  mul    r13
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [reg_p2+48] 
+  mul    r12
+  add    r10, rax
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  mov    rax, [reg_p2+40] 
+  mul    r14
+  add    r10, rax
+  mov    [rcx+88], r10     // c3 
+  adc    r8, rdx 
+  adc    r9, 0
+  
+  xor    r10, r10
+  mov    rax, [reg_p2+56] 
+  mul    r12
+  add    r8, rax
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  mov    rax, [reg_p2+40] 
+  mul    r15
+  add    r8, rax
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  mov    rax, [reg_p2+48] 
+  mul    r14
+  add    r8, rax
+  mov    [rcx+96], r8      // c4 
+  adc    r9, rdx 
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [reg_p2+56]
+  mul    r14
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+
+  mov    rax, [reg_p2+48] 
+  mul    r15
+  add    r9, rax
+  mov    [rcx+104], r9     // c5 
+  adc    r10, rdx
+  adc    r8, 0
+
+  mov    rax, [reg_p2+56] 
+  mul    r15
+  add    r10, rax
+  mov    [rcx+112], r10    // c6 
+  adc    r8, rdx   
+  mov    [rcx+120], r8     // c7 
+      
+  // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL 
+  mov    r8,  [rsp]
+  sub    r8,  [rcx] 
+  mov    r9,  [rsp+8]
+  sbb    r9,  [rcx+8]
+  mov    r10, [rsp+16]
+  sbb    r10, [rcx+16]
+  mov    r11, [rsp+24]
+  sbb    r11, [rcx+24] 
+  mov    r12, [rsp+32]
+  sbb    r12, [rcx+32]
+  mov    r13, [rsp+40]
+  sbb    r13, [rcx+40] 
+  mov    r14, [rsp+48]
+  sbb    r14, [rcx+48] 
+  mov    r15, [rsp+56]
+  sbb    r15, [rcx+56]
+      
+  // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
+  mov    rax, [rcx+64]
+  sub    r8,  rax 
+  mov    rax, [rcx+72]
+  sbb    r9,  rax
+  mov    rax, [rcx+80]
+  sbb    r10, rax
+  mov    rax, [rcx+88]
+  sbb    r11, rax 
+  mov    rax, [rcx+96]     // words 4..7 of AH*BH stay in rax, rdx, rdi, rsi for the carry propagation below
+  sbb    r12, rax
+  mov    rdx, [rcx+104]
+  sbb    r13, rdx
+  mov    rdi, [rcx+112]    // reg_p1 is overwritten here; a is no longer read
+  sbb    r14, rdi 
+  mov    rsi, [rcx+120]    // reg_p2 is overwritten here; b is no longer read
+  sbb    r15, rsi 
+      
+  // Final result: the middle term is added into rcx[4-11] and the carry is propagated through rcx[12-15]
+  add    r8,  [rcx+32] 
+  mov    [rcx+32], r8
+  adc    r9,  [rcx+40]
+  mov    [rcx+40], r9
+  adc    r10, [rcx+48]
+  mov    [rcx+48], r10
+  adc    r11, [rcx+56]
+  mov    [rcx+56], r11
+  adc    r12, [rcx+64]
+  mov    [rcx+64], r12
+  adc    r13, [rcx+72]
+  mov    [rcx+72], r13
+  adc    r14, [rcx+80] 
+  mov    [rcx+80], r14
+  adc    r15, [rcx+88] 
+  mov    [rcx+88], r15
+  adc    rax, 0
+  mov    [rcx+96], rax
+  adc    rdx, 0
+  mov    [rcx+104], rdx
+  adc    rdi, 0
+  mov    [rcx+112], rdi
+  adc    rsi, 0
+  mov    [rcx+120], rsi
+    
+  add    rsp, 80           // Restoring space in stack
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+  
+//***********************************************************************
+//  Montgomery reduction, based on comba method (one result column at a time, r8/r9/r10 rotate as the 3-word accumulator)
+//  Operation: c [reg_p2] = a [reg_p1]; a is read at offsets 0..120, c is written at offsets 0..56
+//  Only offsets 24..56 of p503p1 are used as multiplicands; multipliers are a0, a1, a2 and the intermediate z3..z7
+//  NOTE: a=c is not allowed: c+24..c+56 first hold intermediate digits (overwritten later) while a is still being read
+//*********************************************************************** 
+.global fmt(rdc503_asm)
+fmt(rdc503_asm):
+  push   r12                 // r12-r15 are saved here and restored before ret
+  push   r13 
+  push   r14 
+  push   r15 
+
+  mov    r11, [reg_p1]       // r11 <- a0
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    r11
+  xor    r8, r8
+  add    rax, [reg_p1+24]
+  mov    [reg_p2+24], rax    // z3 (intermediate digit, reloaded into r14 below)
+  adc    r8, rdx
+  
+  xor    r9, r9
+  mov    rax, [rip+fmt(p503p1)+32] 
+  mul    r11
+  xor    r10, r10
+  add    r8, rax
+  adc    r9, rdx
+
+  mov    r12, [reg_p1+8]     // r12 <- a1
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    r12
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  add    r8, [reg_p1+32]
+  mov    [reg_p2+32], r8    // z4 (intermediate digit, reloaded into r15 below)
+  adc    r9, 0
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    r11
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+32] 
+  mul    r12
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    r13, [reg_p1+16]    // r13 <- a2
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    r13
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  add    r9, [reg_p1+40]
+  mov    [reg_p2+40], r9    // z5 (intermediate digit, reloaded into rcx below)
+  adc    r10, 0
+  adc    r8, 0
+  
+  xor    r9, r9
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r11
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    r12
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  
+  mov    rax, [rip+fmt(p503p1)+32]
+  mul    r13
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  
+  mov    r14, [reg_p2+24]    // r14 <- intermediate z3 stored above
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    r14
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  add    r10, [reg_p1+48]
+  mov    [reg_p2+48], r10   // z6 (intermediate digit, reloaded into r13 below)
+  adc    r8, 0
+  adc    r9, 0
+  
+  xor    r10, r10
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r11
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r12
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    r13
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+32] 
+  mul    r14
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    r15, [reg_p2+32]    // r15 <- intermediate z4 stored above
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    r15
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  add    r8, [reg_p1+56]
+  mov    [reg_p2+56], r8    // z7 (intermediate digit, reloaded into r14 below)
+  adc    r9, 0
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r12
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r13
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    r14
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+32] 
+  mul    r15
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rcx, [reg_p2+40]    // rcx <- intermediate z5 stored above
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    rcx
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  add    r9, [reg_p1+64]
+  mov    [reg_p2], r9        // z0 (output word, not overwritten afterwards)
+  adc    r10, 0
+  adc    r8, 0
+  
+  xor    r9, r9
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r13
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r14
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+
+  mov    rax, [rip+fmt(p503p1)+40]
+  mul    r15
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+
+  mov    rax, [rip+fmt(p503p1)+32]
+  mul    rcx
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  
+  mov    r13, [reg_p2+48]    // r13 <- intermediate z6 (a2 is not used past this point)
+  mov    rax, [rip+fmt(p503p1)+24]
+  mul    r13
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  add    r10, [reg_p1+72]
+  mov    [reg_p2+8], r10     // z1 (output word, not overwritten afterwards)
+  adc    r8, 0
+  adc    r9, 0
+  
+  xor    r10, r10
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r14
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r15
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    rcx
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+32] 
+  mul    r13
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  
+  mov    r14, [reg_p2+56]    // r14 <- intermediate z7 (intermediate z3 is not used past this point)
+  mov    rax, [rip+fmt(p503p1)+24] 
+  mul    r14
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  add    r8, [reg_p1+80]
+  mov    [reg_p2+16], r8     // z2 (output word, not overwritten afterwards)
+  adc    r9, 0
+  adc    r10, 0
+  
+  xor    r8, r8
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r15
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    rcx
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    r13
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  
+  mov    rax, [rip+fmt(p503p1)+32] 
+  mul    r14
+  add    r9, rax
+  adc    r10, rdx
+  adc    r8, 0
+  add    r9, [reg_p1+88]
+  mov    [reg_p2+24], r9     // z3 (output word, overwrites the intermediate digit)
+  adc    r10, 0
+  adc    r8, 0
+  
+  xor    r9, r9
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    rcx
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r13
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  
+  mov    rax, [rip+fmt(p503p1)+40] 
+  mul    r14
+  add    r10, rax
+  adc    r8, rdx
+  adc    r9, 0
+  add    r10, [reg_p1+96]
+  mov    [reg_p2+32], r10    // z4 (output word, overwrites the intermediate digit)
+  adc    r8, 0
+  adc    r9, 0
+  
+  xor    r10, r10
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r13
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+
+  mov    rax, [rip+fmt(p503p1)+48] 
+  mul    r14
+  add    r8, rax
+  adc    r9, rdx
+  adc    r10, 0
+  add    r8, [reg_p1+104]    // z5
+  mov    [reg_p2+40], r8     // z5 (output word, overwrites the intermediate digit)
+  adc    r9, 0
+  adc    r10, 0
+  
+  mov    rax, [rip+fmt(p503p1)+56] 
+  mul    r14
+  add    r9, rax
+  adc    r10, rdx
+  add    r9, [reg_p1+112]    // z6
+  mov    [reg_p2+48], r9     // z6 (output word, overwrites the intermediate digit)
+  adc    r10, 0  
+  add    r10, [reg_p1+120]   // z7 (a carry out of this last addition is not propagated anywhere)
+  mov    [reg_p2+56], r10    // z7 (output word, overwrites the intermediate digit)
+
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+#endif
+
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = z + a x b, interleaved with steps of the form z = (z0 x p503p1 + z)/2^64
+// Inputs: base memory pointers M0 (a), M1 (b); words 1..7 of a are loaded from 8\M0 .. 56\M0 (word 0 is never read here)
+//         rdx is overwritten by the first instruction, so no value needs to be pre-stored in it,
+//         accumulator z in [Z0:Z8], pre-stores a0 x b
+// Output: [Z8, Z0:Z6], in the order listed by the comment on the last step
+// Temps:  regs T0:T1 and rdx; MULADD64x320 / MULADD64x512 are helper macros defined outside this macro
+/////////////////////////////////////////////////////////////////
+.macro FPMUL512x512 M0, M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z8, T0, T1           
+    // [Z1:Z7, Z8] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z0                 // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \T0, \T1
+    
+    // [Z1:Z7, Z8, Z0] <- z = a[1] x b + z
+    mov    rdx, 8\M0
+    MULADD64x512 \M1, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1, \Z0
+    // [Z2:Z7, Z8, Z0] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z1                 // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \T0, \T1
+    
+    // [Z2:Z7, Z8, Z0:Z1] <- z = a[2] x b + z
+    mov    rdx, 16\M0
+    MULADD64x512 \M1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1, \Z1
+    // [Z3:Z7, Z8, Z0:Z1] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z2                // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \T0, \T1
+    
+    // [Z3:Z7, Z8, Z0:Z2] <- z = a[3] x b + z
+    mov    rdx, 24\M0
+    MULADD64x512 \M1, \Z3, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1, \Z2
+    // [Z4:Z7, Z8, Z0:Z2] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z3                // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \T0, \T1
+    
+    // [Z4:Z7, Z8, Z0:Z3] <- z = a[4] x b + z
+    mov    rdx, 32\M0
+    MULADD64x512 \M1, \Z4, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \Z3
+    // [Z5:Z7, Z8, Z0:Z3] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z4                // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+    
+    // [Z5:Z7, Z8, Z0:Z4] <- z = a[5] x b + z
+    mov    rdx, 40\M0
+    MULADD64x512 \M1, \Z5, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \Z4
+    // [Z6:Z7, Z8, Z0:Z4] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z5                // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1
+    
+    // [Z6:Z7, Z8, Z0:Z5] <- z = a[6] x b + z
+    mov    rdx, 48\M0
+    MULADD64x512 \M1, \Z6, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \Z5
+    // [Z7, Z8, Z0:Z5] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z6                // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1       
+    
+    // [Z7, Z8, Z0:Z6] <- z = a[7] x b + z
+    mov    rdx, 56\M0
+    MULADD64x512 \M1, \Z7, \Z8, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \Z6
+    // [Z8, Z0:Z6] <- z = (z0 x p503p1 + z)/2^64
+    mov    rdx, \Z7                // rdx <- z0
+    MULADD64x320 [rip+fmt(p503p1)+24], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1 
+.endm
 
 
 //***********************************************************************
-//  Multiprecision subtraction with correction with 4*p503
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p503
-//*********************************************************************** 
-.global fmt(mp_sub503_p4_asm)
-fmt(mp_sub503_p4_asm):
-
-  SUB503_PX  fmt(p503x4)
-  ret
-
-
-#ifdef _MULX_
-    
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  memory pointers M0 and M1
-// Outputs: memory pointer C and regs T1, T2, T3, T4, T8
-// Temps:   regs T0:T9
-/////////////////////////////////////////////////////////////////
-
-#ifdef _ADX_
-.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
-    mov    \C, \T1           // C0_final
-    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
-    xor    rax, rax   
-    adox   \T0, \T2        
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
-    adox   \T1, \T3        
-    mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
-    adox   \T2, \T4 
-           
-    mov    rdx, 8\M0
-    mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
-    adox   \T3, rax 
-    xor    rax, rax   
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
-    adox   \T4, \T0
-    mov    8\C, \T4          // C1_final  
-    adcx   \T5, \T7      
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
-    adcx   \T6, \T8  
-    adox   \T5, \T1      
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
-    adcx   \T7, \T9        
-    adcx   \T8, rax   
-    adox   \T6, \T2
-    
-    mov    rdx, 16\M0
-    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
-    adox   \T7, \T3
-    adox   \T8, rax
-    xor    rax, rax 
-    mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
-    adox   \T0, \T5   
-    mov    16\C, \T0         // C2_final 
-    adcx   \T1, \T3    
-    mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
-    adcx   \T2, \T4 
-    adox   \T1, \T6       
-    mulx   \T4,\T9, 24\M1    // T3:T4 = A2*B3
-    adcx   \T3, \T9        
-    mov    rdx, 24\M0
-    adcx   \T4, rax         
-
-    adox   \T2, \T7
-    adox   \T3, \T8
-    adox   \T4, rax
-
-    mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
-    xor    rax, rax 
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
-    adcx   \T5, \T7 
-    adox   \T1, \T0       
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
-    adcx   \T6, \T8  
-    adox   \T2, \T5      
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
-    adcx   \T7, \T9        
-    adcx   \T8, rax         
-
-    adox   \T3, \T6
-    adox   \T4, \T7
-    adox   \T8, rax
-.endm 
-
-#else
-
-.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1     // T0:T1 = A0*B0
-    mov    \C, \T1           // C0_final
-    mulx   \T1, \T2, 8\M1    // T1:T2 = A0*B1
-    add    \T0, \T2        
-    mulx   \T2, \T3, 16\M1   // T2:T3 = A0*B2
-    adc    \T1, \T3         
-    mulx   \T3, \T4, 24\M1   // T3:T4 = A0*B3
-    adc    \T2, \T4        
-    mov    rdx, 8\M0
-    adc    \T3, 0         
-
-    mulx   \T5, \T4, \M1     // T5:T4 = A1*B0
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A1*B1
-    add    \T5, \T7        
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A1*B2
-    adc    \T6, \T8        
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A1*B3
-    adc    \T7, \T9        
-    adc    \T8, 0         
-
-    add    \T4, \T0
-    mov    8\C, \T4          // C1_final
-    adc    \T5, \T1
-    adc    \T6, \T2
-    adc    \T7, \T3
-    mov    rdx, 16\M0
-    adc    \T8, 0
-
-    mulx   \T1, \T0, \M1     // T1:T0 = A2*B0
-    mulx   \T2, \T3, 8\M1    // T2:T3 = A2*B1
-    add    \T1, \T3        
-    mulx   \T3, \T4, 16\M1   // T3:T4 = A2*B2
-    adc    \T2, \T4        
-    mulx   \T4,\T9, 24\M1    // T3:T4 = A2*B3
-    adc    \T3, \T9        
-    mov    rdx, 24\M0
-    adc    \T4, 0          
-
-    add    \T0, \T5
-    mov    16\C, \T0         // C2_final
-    adc    \T1, \T6
-    adc    \T2, \T7
-    adc    \T3, \T8
-    adc    \T4, 0
-
-    mulx   \T5, \T0, \M1     // T5:T0 = A3*B0
-    mulx   \T6, \T7, 8\M1    // T6:T7 = A3*B1
-    add    \T5, \T7        
-    mulx   \T7, \T8, 16\M1   // T7:T8 = A3*B2
-    adc    \T6, \T8        
-    mulx   \T8, \T9, 24\M1   // T8:T9 = A3*B3
-    adc    \T7, \T9         
-    adc    \T8, 0         
-
-    add    \T1, \T0
-    adc    \T2, \T5
-    adc    \T3, \T6
-    adc    \T4, \T7
-    adc    \T8, 0
-.endm
-#endif
-
-
-//*****************************************************************************
-//  503-bit multiplication using Karatsuba (one level), schoolbook (one level)
-//***************************************************************************** 
-.global fmt(mul503_asm)
-fmt(mul503_asm):    
-    push   r12
-    push   r13 
-    push   r14 
-    push   r15
-    mov    rcx, reg_p3 
-
-    // r8-r11 <- AH + AL, rax <- mask
-    xor    rax, rax
-    mov    r8, [reg_p1]
-    mov    r9, [reg_p1+8]
-    mov    r10, [reg_p1+16]
-    mov    r11, [reg_p1+24] 
-    push   rbx 
-    push   rbp
-    sub    rsp, 96
-    add    r8, [reg_p1+32]
-    adc    r9, [reg_p1+40]
-    adc    r10, [reg_p1+48]
-    adc    r11, [reg_p1+56]
-    sbb    rax, 0
-    mov    [rsp], r8
-    mov    [rsp+8], r9
-    mov    [rsp+16], r10
-    mov    [rsp+24], r11
-
-    // r12-r15 <- BH + BL, rbx <- mask
-    xor    rbx, rbx
-    mov    r12, [reg_p2]
-    mov    r13, [reg_p2+8]
-    mov    r14, [reg_p2+16]
-    mov    r15, [reg_p2+24]
-    add    r12, [reg_p2+32]
-    adc    r13, [reg_p2+40]
-    adc    r14, [reg_p2+48]
-    adc    r15, [reg_p2+56]
-    sbb    rbx, 0
-    mov    [rsp+32], r12
-    mov    [rsp+40], r13
-    mov    [rsp+48], r14
-    mov    [rsp+56], r15
-    
-    // r12-r15 <- masked (BH + BL)
-    and    r12, rax
-    and    r13, rax
-    and    r14, rax
-    and    r15, rax
-
-    // r8-r11 <- masked (AH + AL)
-    and    r8, rbx
-    and    r9, rbx
-    and    r10, rbx
-    and    r11, rbx
-
-    // r8-r11 <- masked (AH + AL) + masked (AH + AL)
-    add    r8, r12
-    adc    r9, r13
-    adc    r10, r14
-    adc    r11, r15
-    mov    [rsp+64], r8
-    mov    [rsp+72], r9
-    mov    [rsp+80], r10
-    mov    [rsp+88], r11
-
-    // [rcx+64], r9-r12, rbx <- (AH+AL) x (BH+BL), low part 
-    MUL256_SCHOOL  [rsp], [rsp+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp 
-    mov    [rcx+88], r9  
-    mov    [rcx+96], r10 
-    mov    [rcx+104], r11
-    mov    [rcx+112], r12
-    mov    [rcx+120], rbx
-
-    // [rcx], r9-r12, rbx <- AL x BL
-    MUL256_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp     // Result C0-C3 
-    mov    [rcx+24], r9  
-    mov    [rcx+32], r10 
-    mov    [rcx+40], r11
-    mov    [rcx+48], r12
-    mov    [rcx+56], rbx
-
-    // [rsp], rbx, rbp, r13-r15 <- AH x BH 
-    MUL256_SCHOOL  [reg_p1+32], [reg_p2+32], [rsp], r8, rbx, rbp, r13, r14, r9, r10, r11, r15, r12
-    
-    // r8-r11 <- (AH+AL) x (BH+BL), final step
-    mov    r8, [rsp+64]
-    mov    r9, [rsp+72]
-    mov    r10, [rsp+80]
-    mov    r11, [rsp+88]
-    mov    rax, [rcx+96]
-    add    r8, rax
-    mov    rax, [rcx+104]
-    adc    r9, rax
-    mov    rax, [rcx+112]
-    adc    r10, rax
-    mov    rax, [rcx+120]
-    adc    r11, rax
-    
-    // r8-r12, rdi, rsi, rdx <- (AH+AL) x (BH+BL) - ALxBL
-    mov    r12, [rcx+64]
-    mov    rdi, [rcx+72]
-    mov    rsi, [rcx+80]
-    mov    rdx, [rcx+88]
-    sub    r12, [rcx]
-    sbb    rdi, [rcx+8]
-    sbb    rsi, [rcx+16]
-    sbb    rdx, [rcx+24]
-    sbb    r8, [rcx+32]
-    sbb    r9, [rcx+40]
-    sbb    r10, [rcx+48]
-    sbb    r11, [rcx+56]
-    
-    // r8-r12, rdi, rsi, rdx <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
-    sub    r12, [rsp]
-    sbb    rdi, [rsp+8]
-    sbb    rsi, [rsp+16]
-    sbb    rdx, rbx
-    sbb    r8, rbp
-    sbb    r9, r13
-    sbb    r10, r14
-    sbb    r11, r15
-    
-    add    r12, [rcx+32]
-    mov    [rcx+32], r12    // Result C4-C7
-    adc    rdi, [rcx+40]
-    mov    [rcx+40], rdi 
-    adc    rsi, [rcx+48]
-    mov    [rcx+48], rsi 
-    adc    rdx, [rcx+56]
-    mov    [rcx+56], rdx 
-    mov    rax, [rsp]
-    adc    r8, rax 
-    mov    [rcx+64], r8    // Result C8-C15
-    mov    rax, [rsp+8]
-    adc    r9, rax
-    mov    [rcx+72], r9 
-    mov    rax, [rsp+16]
-    adc    r10, rax
-    mov    [rcx+80], r10 
-    adc    r11, rbx
-    mov    [rcx+88], r11
-    adc    rbp, 0
-    mov    [rcx+96], rbp
-    adc    r13, 0
-    mov    [rcx+104], r13
-    adc    r14, 0
-    mov    [rcx+112], r14
-    adc    r15, 0
-    mov    [rcx+120], r15  
-    
-    add    rsp, 96    
-    pop    rbp  
-    pop    rbx
-    pop    r15
-    pop    r14
-    pop    r13
-    pop    r12
+//  Squaring in GF(p^2), non-complex part
+//  Operation: c [reg_p2] = (a0+a1) x (a0-a1+4xp503), with the p503p1 steps of FPMUL512x512 interleaved
+//  Inputs: a = [a1, a0] stored in [reg_p1]: a0 at offsets 0..56, a1 at offsets 64..120
+//  Output: c stored in [reg_p2] (offsets 0..56); [reg_p2+8 .. reg_p2+120] is also written as scratch for the operands
+//***********************************************************************
+.global fmt(fp2sqr503_c0_asm)
+fmt(fp2sqr503_c0_asm):   
+    push   r12
+    push   r13
+
+	// a0 + a1 -> r8:r15 (no reduction); word 0 stays in r8 only, words 1..7 are spilled to [reg_p2+8 .. reg_p2+56]
+	mov    r8, [reg_p1]
+	mov    r9, [reg_p1+8]
+	mov    r10, [reg_p1+16]
+	mov    r11, [reg_p1+24]
+	mov    r12, [reg_p1+32]
+	mov    r13, [reg_p1+40]
+	add    r8, [reg_p1+64]
+	adc    r9, [reg_p1+72] 
+    push   r14               // push/mov do not modify flags, so the carry chain continues across them
+	adc    r10, [reg_p1+80]
+	adc    r11, [reg_p1+88] 
+    push   r15
+	adc    r12, [reg_p1+96]
+	adc    r13, [reg_p1+104]
+	mov    r14, [reg_p1+48]
+	mov    r15, [reg_p1+56]
+	adc    r14, [reg_p1+112]
+	adc    r15, [reg_p1+120]
+	mov    [reg_p2+8], r9
+	mov    [reg_p2+16], r10
+	mov    [reg_p2+24], r11
+	mov    [reg_p2+32], r12
+	mov    [reg_p2+40], r13
+	mov    [reg_p2+48], r14
+	mov    [reg_p2+56], r15
+	
+	// a0 - a1 + 4xp503 -> rcx, r10, r12, r13, r14, r15, rbx, rbp (spilled to [reg_p2+64 .. reg_p2+120] further below)
+	mov    rcx, [reg_p1]
+	mov    r10, [reg_p1+8]
+	mov    r12, [reg_p1+16]
+	mov    r13, [reg_p1+24]
+	mov    r14, [reg_p1+32]
+	mov    r15, [reg_p1+40]
+	sub    rcx, [reg_p1+64]
+	sbb    r10, [reg_p1+72]  
+    push   rbx               // flags are untouched by push, the borrow chain continues
+	sbb    r12, [reg_p1+80]
+	sbb    r13, [reg_p1+88] 
+    push   rbp
+	sbb    r14, [reg_p1+96]
+	sbb    r15, [reg_p1+104]
+	mov    rbx, [reg_p1+48]
+	mov    rbp, [reg_p1+56]
+	sbb    rbx, [reg_p1+112]
+	sbb    rbp, [reg_p1+120]
+	add    rcx, [rip+fmt(p503x4)]                    
+	mov    rdx, [rip+fmt(p503x4)+8]  // reused for words 1 and 2: assumes p503x4+8 and p503x4+16 hold the same value (TODO confirm)
+	adc    r10, rdx
+	adc    r12, rdx
+	adc    r13, [rip+fmt(p503x4)+24]
+	adc    r14, [rip+fmt(p503x4)+32]
+	adc    r15, [rip+fmt(p503x4)+40]
+	adc    rbx, [rip+fmt(p503x4)+48]
+	adc    rbp, [rip+fmt(p503x4)+56]
+	mov    [reg_p2+64], rcx                 
+	mov    [reg_p2+72], r10
+    
+    // [r8:r15, rbp] <- z = a00 x a1, i.e. word 0 of (a0+a1) times the 8 words of (a0-a1+4xp503)
+    mov    rdx, r8
+    mulx   r9, r8, rcx   
+    xor    rax, rax          // rax <- 0; also clears CF and OF before the adox chain
+	mov    [reg_p2+80], r12  
+    mulx   r10, r11, r10  
+	mov    [reg_p2+88], r13 
+    adox   r9, r11        
+    mulx   r11, r12, r12  
+	mov    [reg_p2+96], r14 
+    adox   r10, r12        
+    mulx   r12, r13, r13 
+	mov    [reg_p2+104], r15  
+    adox   r11, r13       
+    mulx   r13, r14, r14  
+	mov    [reg_p2+112], rbx 
+    adox   r12, r14      
+    mulx   r14, r15, r15 
+	mov    [reg_p2+120], rbp 
+    adox   r13, r15      
+    mulx   r15, rbp, rbx  
+    adox   r14, rbp      
+    mulx   rbp, rbx, [reg_p2+120]  // word 7 is re-read from the spill slot because rbp was just overwritten
+    adox   r15, rbx 
+    adox   rbp, rax
+           
+	FPMUL512x512 [reg_p2], [reg_p2+64], r8, r9, r10, r11, r12, r13, r14, r15, rbp, rbx, rcx
+                 
+    // Store the 8 result words left in rbp, r8:r14 by the macro (its Z8, Z0:Z6)
+    mov    [reg_p2], rbp
+    mov    [reg_p2+8], r8         
+    mov    [reg_p2+16], r9      
+    mov    [reg_p2+24], r10     
+    mov    [reg_p2+32], r11     
+    mov    [reg_p2+40], r12     
+    mov    [reg_p2+48], r13
+    mov    [reg_p2+56], r14
+    pop    rbp
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
     ret
 
-#else
-
-//***********************************************************************
-//  Integer multiplication
-//  Based on Karatsuba method
-//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
-//  NOTE: a=c or b=c are not allowed
-//***********************************************************************
-.global fmt(mul503_asm)
-fmt(mul503_asm):
-  push   r12
-  push   r13
-  push   r14
-  mov    rcx, reg_p3
-  
-  // rcx[0-3] <- AH+AL
-  xor    rax, rax
-  mov    r8, [reg_p1+32]
-  mov    r9, [reg_p1+40]
-  mov    r10, [reg_p1+48]
-  mov    r11, [reg_p1+56]
-  add    r8, [reg_p1] 
-  adc    r9, [reg_p1+8] 
-  adc    r10, [reg_p1+16] 
-  adc    r11, [reg_p1+24] 
-  push   r15  
-  mov    [rcx], r8
-  mov    [rcx+8], r9
-  mov    [rcx+16], r10
-  mov    [rcx+24], r11
-  sbb    rax, 0 
-  sub    rsp, 80           // Allocating space in stack
-       
-  // r12-r15 <- BH+BL
-  xor    rdx, rdx
-  mov    r12, [reg_p2+32]
-  mov    r13, [reg_p2+40]
-  mov    r14, [reg_p2+48]
-  mov    r15, [reg_p2+56]
-  add    r12, [reg_p2] 
-  adc    r13, [reg_p2+8] 
-  adc    r14, [reg_p2+16] 
-  adc    r15, [reg_p2+24] 
-  sbb    rdx, 0 
-  mov    [rsp+64], rax
-  mov    [rsp+72], rdx
-  
-  // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL)
-  mov    rax, [rcx]
-  mul    r12
-  mov    [rsp], rax        // c0
-  mov    r8, rdx
-  
-  xor    r9, r9
-  mov    rax, [rcx]
-  mul    r13
-  add    r8, rax
-  adc    r9, rdx
-  
-  xor    r10, r10
-  mov    rax, [rcx+8] 
-  mul    r12
-  add    r8, rax
-  mov    [rsp+8], r8       // c1 
-  adc    r9, rdx
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [rcx] 
-  mul    r14
-  add    r9, rax 
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  mov    rax, [rcx+16] 
-  mul    r12
-  add    r9, rax
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  mov    rax, [rcx+8] 
-  mul    r13
-  add    r9, rax
-  mov    [rsp+16], r9      // c2 
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  xor    r9, r9
-  mov    rax, [rcx] 
-  mul    r15
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [rcx+24] 
-  mul    r12
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [rcx+8] 
-  mul    r14
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [rcx+16] 
-  mul    r13
-  add    r10, rax
-  mov    [rsp+24], r10     // c3 
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  xor    r10, r10
-  mov    rax, [rcx+8] 
-  mul    r15
-  add    r8, rax
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  mov    rax, [rcx+24] 
-  mul    r13
-  add    r8, rax
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  mov    rax, [rcx+16] 
-  mul    r14
-  add    r8, rax
-  mov    [rsp+32], r8      // c4 
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  xor    r11, r11
-  mov    rax, [rcx+16]
-  mul    r15
-  add    r9, rax
-  adc    r10, rdx
-  adc    r11, 0
-
-  mov    rax, [rcx+24] 
-  mul    r14
-  add    r9, rax          // c5 
-  adc    r10, rdx
-  adc    r11, 0
-
-  mov    rax, [rcx+24] 
-  mul    r15
-  add    r10, rax         // c6 
-  adc    r11, rdx         // c7 
-  
-  mov    rax, [rsp+64]
-  and    r12, rax
-  and    r13, rax
-  and    r14, rax
-  and    r15, rax
-  add    r12, r8
-  adc    r13, r9
-  adc    r14, r10
-  adc    r15, r11
-
-  mov    rax, [rsp+72]  
-  mov    r8, [rcx]
-  mov    r9, [rcx+8]
-  mov    r10, [rcx+16]
-  mov    r11, [rcx+24]
-  and    r8, rax
-  and    r9, rax
-  and    r10, rax
-  and    r11, rax
-  add    r8, r12
-  adc    r9, r13
-  adc    r10, r14
-  adc    r11, r15
-  mov    [rsp+32], r8
-  mov    [rsp+40], r9
-  mov    [rsp+48], r10
-  mov    [rsp+56], r11
-  
-  // rcx[0-7] <- AL*BL
-  mov    r11, [reg_p1]
-  mov    rax, [reg_p2] 
-  mul    r11
-  xor    r9, r9
-  mov    [rcx], rax        // c0
-  mov    r8, rdx
-  
-  mov    r14, [reg_p1+16] 
-  mov    rax, [reg_p2+8]
-  mul    r11
-  xor    r10, r10
-  add    r8, rax
-  adc    r9, rdx
-
-  mov    r12, [reg_p1+8] 
-  mov    rax, [reg_p2] 
-  mul    r12
-  add    r8, rax
-  mov    [rcx+8], r8       // c1 
-  adc    r9, rdx
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [reg_p2+16] 
-  mul    r11
-  add    r9, rax
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  mov    r13, [reg_p2] 
-  mov    rax, r14 
-  mul    r13
-  add    r9, rax
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  mov    rax, [reg_p2+8] 
-  mul    r12
-  add    r9, rax
-  mov    [rcx+16], r9      // c2 
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  xor    r9, r9
-  mov    rax, [reg_p2+24] 
-  mul    r11
-  mov    r15, [reg_p1+24] 
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, r15 
-  mul    r13
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [reg_p2+16] 
-  mul    r12
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [reg_p2+8] 
-  mul    r14
-  add    r10, rax
-  mov    [rcx+24], r10     // c3 
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  xor    r10, r10
-  mov    rax, [reg_p2+24] 
-  mul    r12
-  add    r8, rax
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  mov    rax, [reg_p2+8] 
-  mul    r15
-  add    r8, rax
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  mov    rax, [reg_p2+16] 
-  mul    r14
-  add    r8, rax
-  mov    [rcx+32], r8     // c4 
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [reg_p2+24]
-  mul    r14
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-
-  mov    rax, [reg_p2+16] 
-  mul    r15
-  add    r9, rax
-  mov    [rcx+40], r9      // c5 
-  adc    r10, rdx
-  adc    r8, 0
-
-  mov    rax, [reg_p2+24] 
-  mul    r15
-  add    r10, rax
-  mov    [rcx+48], r10     // c6 
-  adc    r8, rdx   
-  mov    [rcx+56], r8      // c7 
-
-  // rcx[8-15] <- AH*BH
-  mov    r11, [reg_p1+32]
-  mov    rax, [reg_p2+32] 
-  mul    r11
-  xor    r9, r9
-  mov    [rcx+64], rax     // c0
-  mov    r8, rdx
-  
-  mov    r14, [reg_p1+48] 
-  mov    rax, [reg_p2+40]
-  mul    r11
-  xor    r10, r10
-  add    r8, rax
-  adc    r9, rdx
-
-  mov    r12, [reg_p1+40] 
-  mov    rax, [reg_p2+32] 
-  mul    r12
-  add    r8, rax
-  mov    [rcx+72], r8      // c1 
-  adc    r9, rdx
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [reg_p2+48] 
-  mul    r11
-  add    r9, rax
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  mov    r13, [reg_p2+32] 
-  mov    rax, r14 
-  mul    r13
-  add    r9, rax
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  mov    rax, [reg_p2+40] 
-  mul    r12
-  add    r9, rax
-  mov    [rcx+80], r9      // c2 
-  adc    r10, rdx 
-  adc    r8, 0
-  
-  xor    r9, r9
-  mov    rax, [reg_p2+56] 
-  mul    r11
-  mov    r15, [reg_p1+56] 
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, r15 
-  mul    r13
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [reg_p2+48] 
-  mul    r12
-  add    r10, rax
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  mov    rax, [reg_p2+40] 
-  mul    r14
-  add    r10, rax
-  mov    [rcx+88], r10     // c3 
-  adc    r8, rdx 
-  adc    r9, 0
-  
-  xor    r10, r10
-  mov    rax, [reg_p2+56] 
-  mul    r12
-  add    r8, rax
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  mov    rax, [reg_p2+40] 
-  mul    r15
-  add    r8, rax
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  mov    rax, [reg_p2+48] 
-  mul    r14
-  add    r8, rax
-  mov    [rcx+96], r8      // c4 
-  adc    r9, rdx 
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [reg_p2+56]
-  mul    r14
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-
-  mov    rax, [reg_p2+48] 
-  mul    r15
-  add    r9, rax
-  mov    [rcx+104], r9     // c5 
-  adc    r10, rdx
-  adc    r8, 0
-
-  mov    rax, [reg_p2+56] 
-  mul    r15
-  add    r10, rax
-  mov    [rcx+112], r10    // c6 
-  adc    r8, rdx   
-  mov    [rcx+120], r8     // c7 
-      
-  // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL 
-  mov    r8,  [rsp]
-  sub    r8,  [rcx] 
-  mov    r9,  [rsp+8]
-  sbb    r9,  [rcx+8]
-  mov    r10, [rsp+16]
-  sbb    r10, [rcx+16]
-  mov    r11, [rsp+24]
-  sbb    r11, [rcx+24] 
-  mov    r12, [rsp+32]
-  sbb    r12, [rcx+32]
-  mov    r13, [rsp+40]
-  sbb    r13, [rcx+40] 
-  mov    r14, [rsp+48]
-  sbb    r14, [rcx+48] 
-  mov    r15, [rsp+56]
-  sbb    r15, [rcx+56]
-      
-  // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH
-  mov    rax, [rcx+64]
-  sub    r8,  rax 
-  mov    rax, [rcx+72]
-  sbb    r9,  rax
-  mov    rax, [rcx+80]
-  sbb    r10, rax
-  mov    rax, [rcx+88]
-  sbb    r11, rax 
-  mov    rax, [rcx+96]
-  sbb    r12, rax
-  mov    rdx, [rcx+104]
-  sbb    r13, rdx
-  mov    rdi, [rcx+112]
-  sbb    r14, rdi 
-  mov    rsi, [rcx+120]
-  sbb    r15, rsi 
-      
-  // Final result
-  add    r8,  [rcx+32] 
-  mov    [rcx+32], r8
-  adc    r9,  [rcx+40]
-  mov    [rcx+40], r9
-  adc    r10, [rcx+48]
-  mov    [rcx+48], r10
-  adc    r11, [rcx+56]
-  mov    [rcx+56], r11
-  adc    r12, [rcx+64]
-  mov    [rcx+64], r12
-  adc    r13, [rcx+72]
-  mov    [rcx+72], r13
-  adc    r14, [rcx+80] 
-  mov    [rcx+80], r14
-  adc    r15, [rcx+88] 
-  mov    [rcx+88], r15
-  adc    rax, 0
-  mov    [rcx+96], rax
-  adc    rdx, 0
-  mov    [rcx+104], rdx
-  adc    rdi, 0
-  mov    [rcx+112], rdi
-  adc    rsi, 0
-  mov    [rcx+120], rsi
-    
-  add    rsp, 80           // Restoring space in stack
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-#endif
-
-
-#ifdef _MULX_
-
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  regs I0 and I1, and memory pointer M1
-// Outputs: regs T0:T5
-// Temps:   regs T0:T5
-/////////////////////////////////////////////////////////////////
-
-#ifdef _ADX_
-.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5
-    mulx   \T2, \T4, 8\M1
-    xor    rax, rax
-    mulx   \T3, \T5, 16\M1 
-    ADD1   \T1, \T4               
-    ADC1   \T2, \T5     
-    mulx   \T4, \T5, 24\M1
-    ADC1   \T3, \T5 
-    ADC1   \T4, rax   
-    
-    xor    rax, rax
-    mov    rdx, \I1 
-    mulx   \I1, \T5, \M1 
-    ADD2   \T1, \T5            // T1 <- C1_final 
-    ADC2   \T2, \I1     
-    mulx   \T5, \I1, 8\M1
-    ADC2   \T3, \T5 
-    ADD1   \T2, \I1        
-    mulx   \T5, \I1, 16\M1
-    ADC2   \T4, \T5 
-    ADC1   \T3, \I1     
-    mulx   \T5, \I1, 24\M1   
-    ADC2   \T5, rax         
-    ADC1   \T4, \I1  
-    ADC1   \T5, rax 
-.endm
-
-#else
-
-.macro MUL128x256_SCHOOL I0, I1, M1, T0, T1, T2, T3, T4, T5 
-    mulx   \T2, \T4, 8\M1
-    mulx   \T3, \T5, 16\M1 
-    add    \T1, \T4               
-    adc    \T2, \T5     
-    mulx   \T4, \T5, 24\M1
-    adc    \T3, \T5 
-    adc    \T4, 0   
-    
-    mov    rdx, \I1 
-    mulx   \I1, \T5, \M1 
-    add    \T1, \T5            // T1 <- C1_final 
-    adc    \T2, \I1     
-    mulx   \T5, \I1, 8\M1
-    adc    \T3, \T5       
-    mulx   \T5, rax, 16\M1
-    adc    \T4, \T5     
-    mulx   \T5, rdx, 24\M1 
-    adc    \T5, 0
-    add    \T2, \I1  
-    adc    \T3, rax        
-    adc    \T4, rdx  
-    adc    \T5, 0 
-.endm
-#endif
-
-  
-//**************************************************************************************
-//  Montgomery reduction
-//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015  
-//  Operation: c [reg_p2] = a [reg_p1]
-//************************************************************************************** 
-.global fmt(rdc503_asm)
-fmt(rdc503_asm):
-
-    // a[0-1] x 64xp503p1_nz --> result: r8:r13  
-    mov    rdx, [reg_p1]
-    mov    rcx, [reg_p1+8]  
-    mulx   r9, r8, [rip+fmt(p503p1x64)]   // result r8  
-    push   rbx
-    push   rbp
-    push   r12
-    push   r13 
-    push   r14 
-    push   r15
-    MUL128x256_SCHOOL rdx, rcx, [rip+fmt(p503p1x64)], r8, r9, r10, r11, r12, r13     
-
-    xor    r15, r15
-    shrd   r15, r8, 6 
-    shrd   r8, r9, 6 
-    shrd   r9, r10, 6 
-    shrd   r10, r11, 6 
-    shrd   r11, r12, 6 
-    shrd   r12, r13, 6 
-    shr    r13, 6
-    mov    rdx, [reg_p1+16] 
-    mov    r14, [reg_p1+80] 
-    add    r15, [reg_p1+24]
-    adc    r8, [reg_p1+32]  
-    adc    r9, [reg_p1+40]  
-    adc    r10, [reg_p1+48]   
-    adc    r11, [reg_p1+56]   
-    adc    r12, [reg_p1+64]   
-    adc    r13, [reg_p1+72]  
-    mulx   rbx, rcx, [rip+fmt(p503p1x64)]   // result rcx
-    adc    r14, 0
-    mov    [reg_p2], r8  
-    mov    [reg_p2+8], r9  
-    mov    [reg_p2+16], r10  
-    mov    [reg_p2+24], r11  
-    mov    [reg_p2+32], r12   
-    mov    [reg_p2+40], r13    
-    mov    [reg_p2+48], r14 
-    mov    r9, [reg_p1+88]  
-    mov    r10, [reg_p1+96]  
-    mov    r11, [reg_p1+104]  
-    mov    r12, [reg_p1+112]
-    mov    rdi, [reg_p1+120]
-    adc    r9, 0
-    adc    r10, 0
-    adc    r11, 0
-    adc    r12, 0
-    adc    rdi, 0
-
-    // a[2-3] x 64xp503p1_nz --> result: rcx, rbx, rbp, r14, r8, r13
-    MUL128x256_SCHOOL rdx, r15, [rip+fmt(p503p1x64)], rcx, rbx, rbp, r14, r8, r13 
-
-    xor    r15, r15
-    shrd   r15, rcx, 6 
-    shrd   rcx, rbx, 6 
-    shrd   rbx, rbp, 6 
-    shrd   rbp, r14, 6 
-    shrd   r14, r8, 6  
-    shrd   r8, r13, 6 
-    shr    r13, 6
-    mov    rdx, [reg_p2]
-    add    r15, [reg_p2+8]
-    adc    rcx, [reg_p2+16]  
-    adc    rbx, [reg_p2+24]  
-    adc    rbp, [reg_p2+32]   
-    adc    r14, [reg_p2+40]  
-    adc    r8, [reg_p2+48]
-    mov    [reg_p2+16], rcx  
-    mov    [reg_p2+24], rbx  
-    mov    [reg_p2+32], rbp   
-    mov    [reg_p2+40], r14 
-    mov    [reg_p2+48], r8 
-    mulx   rbp, rbx, [rip+fmt(p503p1x64)]   // result rbx    
-    adc    r9, r13 
-    adc    r10, 0
-    adc    r11, 0
-    adc    r12, 0
-    adc    rdi, 0
-
-    // a[4-5] x 64xp503p1_nz --> result: rbx, rbp, r14, r8, r13, rcx
-    MUL128x256_SCHOOL rdx, r15, [rip+fmt(p503p1x64)], rbx, rbp, r14, r8, r13, rcx  
-
-    xor    r15, r15
-    shrd   r15, rbx, 6 
-    shrd   rbx, rbp, 6 
-    shrd   rbp, r14, 6 
-    shrd   r14, r8, 6 
-    shrd   r8, r13, 6 
-    shrd   r13, rcx, 6 
-    shr    rcx, 6
-    mov    rdx, [reg_p2+16]
-    add    r15, [reg_p2+24]
-    adc    rbx, [reg_p2+32]  
-    adc    rbp, [reg_p2+40]  
-    adc    r14, [reg_p2+48]  
-    mov    [reg_p2], rbx              // Final result c0
-    mov    [reg_p2+8], rbp   
-    mov    [reg_p2+16], r14
-    adc    r9, r8 
-    adc    r10, r13  
-    mulx   r14, rbp, [rip+fmt(p503p1x64)]   // result rbp  
-    adc    r11, rcx
-    adc    r12, 0
-    adc    rdi, 0
-
-    // a[6-7] x 64xp503p1_nz --> result: rbp, r14, r8, r13, rcx, rbx
-    MUL128x256_SCHOOL rdx, r15, [rip+fmt(p503p1x64)], rbp, r14, r8, r13, rcx, rbx  
-    
-    xor    r15, r15
-    shrd   r15, rbp, 6 
-    shrd   rbp, r14, 6 
-    shrd   r14, r8, 6 
-    shrd   r8, r13, 6 
-    shrd   r13, rcx, 6 
-    shrd   rcx, rbx, 6 
-    shr    rbx, 6 
-    add    r15, [reg_p2+8]
-    adc    rbp, [reg_p2+16] 
-    mov    [reg_p2+8], r15       // Final result c1-c7
-    mov    [reg_p2+16], rbp  
-    adc    r9, r14 
-    adc    r10, r8 
-    adc    r11, r13
-    adc    r12, rcx
-    adc    rdi, rbx    
-    mov    [reg_p2+24], r9  
-    mov    [reg_p2+32], r10  
-    mov    [reg_p2+40], r11   
-    mov    [reg_p2+48], r12
-    mov    [reg_p2+56], rdi
-
-    pop    r15
-    pop    r14
-    pop    r13
-    pop    r12
-    pop    rbp
-    pop    rbx
-    ret
-    
-  #else
-  
-//***********************************************************************
-//  Montgomery reduction
-//  Based on comba method
-//  Operation: c [reg_p2] = a [reg_p1]
-//  NOTE: a=c is not allowed
-//*********************************************************************** 
-.global fmt(rdc503_asm)
-fmt(rdc503_asm):
-  push   r12
-  push   r13 
-  push   r14 
-  push   r15 
-
-  mov    r11, [reg_p1]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    r11
-  xor    r8, r8
-  add    rax, [reg_p1+24]
-  mov    [reg_p2+24], rax    // z3
-  adc    r8, rdx
-  
-  xor    r9, r9
-  mov    rax, [rip+fmt(p503p1)+32] 
-  mul    r11
-  xor    r10, r10
-  add    r8, rax
-  adc    r9, rdx
-
-  mov    r12, [reg_p1+8]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    r12
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  add    r8, [reg_p1+32]
-  mov    [reg_p2+32], r8    // z4
-  adc    r9, 0
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    r11
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+32] 
-  mul    r12
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    r13, [reg_p1+16]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    r13
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  add    r9, [reg_p1+40]
-  mov    [reg_p2+40], r9    // z5
-  adc    r10, 0
-  adc    r8, 0
-  
-  xor    r9, r9
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r11
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    r12
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  
-  mov    rax, [rip+fmt(p503p1)+32]
-  mul    r13
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  
-  mov    r14, [reg_p2+24]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    r14
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  add    r10, [reg_p1+48]
-  mov    [reg_p2+48], r10   // z6
-  adc    r8, 0
-  adc    r9, 0
-  
-  xor    r10, r10
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r11
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r12
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    r13
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+32] 
-  mul    r14
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    r15, [reg_p2+32]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    r15
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  add    r8, [reg_p1+56]
-  mov    [reg_p2+56], r8    // z7
-  adc    r9, 0
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r12
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r13
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    r14
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+32] 
-  mul    r15
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rcx, [reg_p2+40]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    rcx
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  add    r9, [reg_p1+64]
-  mov    [reg_p2], r9        // z0
-  adc    r10, 0
-  adc    r8, 0
-  
-  xor    r9, r9
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r13
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r14
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-
-  mov    rax, [rip+fmt(p503p1)+40]
-  mul    r15
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-
-  mov    rax, [rip+fmt(p503p1)+32]
-  mul    rcx
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  
-  mov    r13, [reg_p2+48]
-  mov    rax, [rip+fmt(p503p1)+24]
-  mul    r13
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  add    r10, [reg_p1+72]
-  mov    [reg_p2+8], r10     // z1
-  adc    r8, 0
-  adc    r9, 0
-  
-  xor    r10, r10
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r14
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r15
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    rcx
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+32] 
-  mul    r13
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  
-  mov    r14, [reg_p2+56]
-  mov    rax, [rip+fmt(p503p1)+24] 
-  mul    r14
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  add    r8, [reg_p1+80]
-  mov    [reg_p2+16], r8     // z2
-  adc    r9, 0
-  adc    r10, 0
-  
-  xor    r8, r8
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r15
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    rcx
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    r13
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  
-  mov    rax, [rip+fmt(p503p1)+32] 
-  mul    r14
-  add    r9, rax
-  adc    r10, rdx
-  adc    r8, 0
-  add    r9, [reg_p1+88]
-  mov    [reg_p2+24], r9     // z3
-  adc    r10, 0
-  adc    r8, 0
-  
-  xor    r9, r9
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    rcx
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r13
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  
-  mov    rax, [rip+fmt(p503p1)+40] 
-  mul    r14
-  add    r10, rax
-  adc    r8, rdx
-  adc    r9, 0
-  add    r10, [reg_p1+96]
-  mov    [reg_p2+32], r10    // z4
-  adc    r8, 0
-  adc    r9, 0
-  
-  xor    r10, r10
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r13
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-
-  mov    rax, [rip+fmt(p503p1)+48] 
-  mul    r14
-  add    r8, rax
-  adc    r9, rdx
-  adc    r10, 0
-  add    r8, [reg_p1+104]    // z5
-  mov    [reg_p2+40], r8     // z5
-  adc    r9, 0
-  adc    r10, 0
-  
-  mov    rax, [rip+fmt(p503p1)+56] 
-  mul    r14
-  add    r9, rax
-  adc    r10, rdx
-  add    r9, [reg_p1+112]    // z6
-  mov    [reg_p2+48], r9     // z6
-  adc    r10, 0  
-  add    r10, [reg_p1+120]   // z7
-  mov    [reg_p2+56], r10    // z7
-
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-  #endif
-
-
-//***********************************************************************
-//  503-bit multiprecision addition
-//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//*********************************************************************** 
-.global fmt(mp_add503_asm)
-fmt(mp_add503_asm): 
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  add    r8, [reg_p2] 
-  adc    r9, [reg_p2+8] 
-  adc    r10, [reg_p2+16] 
-  adc    r11, [reg_p2+24] 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  
-  mov    r8, [reg_p1+32]
-  mov    r9, [reg_p1+40]
-  mov    r10, [reg_p1+48]
-  mov    r11, [reg_p1+56]
-  adc    r8, [reg_p2+32] 
-  adc    r9, [reg_p2+40] 
-  adc    r10, [reg_p2+48] 
-  adc    r11, [reg_p2+56]
-  mov    [reg_p3+32], r8
-  mov    [reg_p3+40], r9
-  mov    [reg_p3+48], r10
-  mov    [reg_p3+56], r11
-  ret
-
-
-//***********************************************************************
-//  2x503-bit multiprecision subtraction/addition
-//  Operation: c [x2] = a [x0] - b [x1]. If c < 0, add p503*2^512
-//*********************************************************************** 
-.global fmt(mp_subadd503x2_asm)
-fmt(mp_subadd503x2_asm):
-  push   r12
-  push   r13 
-  push   r14 
-  push   r15
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-
-  mov    r8, [reg_p1+32]
-  mov    r9, [reg_p1+40]
-  mov    r10, [reg_p1+48] 
-  mov    r11, [reg_p1+56]
-  sbb    r8, [reg_p2+32] 
-  sbb    r9, [reg_p2+40] 
-  sbb    r10, [reg_p2+48]
-  sbb    r11, [reg_p2+56] 
-  mov    [reg_p3+32], r8
-  mov    [reg_p3+40], r9
-  mov    [reg_p3+48], r10
-  mov    [reg_p3+56], r11
-
-  mov    r8, [reg_p1+64]
-  mov    r9, [reg_p1+72]
-  mov    r10, [reg_p1+80] 
-  mov    r11, [reg_p1+88]
-  sbb    r8, [reg_p2+64] 
-  sbb    r9, [reg_p2+72] 
-  sbb    r10, [reg_p2+80]
-  sbb    r11, [reg_p2+88] 
-  mov    [reg_p3+64], r8
-  mov    [reg_p3+72], r9
-  mov    [reg_p3+80], r10
-  mov    [reg_p3+88], r11
-  
-  mov    r12, [reg_p1+96]
-  mov    r13, [reg_p1+104] 
-  mov    r14, [reg_p1+112]
-  mov    r15, [reg_p1+120]
-  sbb    r12, [reg_p2+96]
-  sbb    r13, [reg_p2+104]
-  sbb    r14, [reg_p2+112]  
-  sbb    r15, [reg_p2+120] 
-  sbb    rax, 0
-  
-  // Add p503 anded with the mask in rax 
-  mov    r8, [rip+fmt(p503)]
-  mov    r9, [rip+fmt(p503)+24]
-  mov    r10, [rip+fmt(p503)+32]
-  mov    r11, [rip+fmt(p503)+40]
-  mov    rdi, [rip+fmt(p503)+48]
-  mov    rsi, [rip+fmt(p503)+56]
-  and    r8, rax
-  and    r9, rax
-  and    r10, rax
-  and    r11, rax
-  and    rdi, rax
-  and    rsi, rax
-  mov    rax, [reg_p3+64]
-  add    rax, r8
-  mov    [reg_p3+64], rax
-  mov    rax, [reg_p3+72]
-  adc    rax, r8
-  mov    [reg_p3+72], rax
-  mov    rax, [reg_p3+80]
-  adc    rax, r8
-  mov    [reg_p3+80], rax
-  mov    rax, [reg_p3+88]
-  adc    rax, r9
-  mov    [reg_p3+88], rax
-  adc    r12, r10
-  adc    r13, r11
-  adc    r14, rdi
-  adc    r15, rsi
-  
-  mov    [reg_p3+96], r12
-  mov    [reg_p3+104], r13
-  mov    [reg_p3+112], r14
-  mov    [reg_p3+120], r15
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
 
 //***********************************************************************
-//  Double 2x503-bit multiprecision subtraction
-//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
-//*********************************************************************** 
-.global fmt(mp_dblsub503x2_asm)
-fmt(mp_dblsub503x2_asm):
-  push   r12
-  push   r13
-  
-  mov    r8, [reg_p3]
-  mov    r9, [reg_p3+8]
-  mov    r10, [reg_p3+16]
-  mov    r11, [reg_p3+24]
-  mov    r12, [reg_p3+32]
-  mov    r13, [reg_p3+40]
-  sub    r8, [reg_p1]
-  sbb    r9, [reg_p1+8] 
-  sbb    r10, [reg_p1+16] 
-  sbb    r11, [reg_p1+24] 
-  sbb    r12, [reg_p1+32] 
-  sbb    r13, [reg_p1+40]
-  setc   al 
-  sub    r8, [reg_p2]
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  setc   cl
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12
-  mov    [reg_p3+40], r13
-  
-  mov    r8, [reg_p3+48]
-  mov    r9, [reg_p3+56]
-  mov    r10, [reg_p3+64]
-  mov    r11, [reg_p3+72]
-  mov    r12, [reg_p3+80]
-  mov    r13, [reg_p3+88]
-  bt     rax, 0  
-  sbb    r8, [reg_p1+48] 
-  sbb    r9, [reg_p1+56]
-  sbb    r10, [reg_p1+64] 
-  sbb    r11, [reg_p1+72] 
-  sbb    r12, [reg_p1+80] 
-  sbb    r13, [reg_p1+88]
-  setc   al 
-  bt     rcx, 0  
-  sbb    r8, [reg_p2+48] 
-  sbb    r9, [reg_p2+56]
-  sbb    r10, [reg_p2+64] 
-  sbb    r11, [reg_p2+72] 
-  sbb    r12, [reg_p2+80] 
-  sbb    r13, [reg_p2+88] 
-  setc   cl
-  mov    [reg_p3+48], r8
-  mov    [reg_p3+56], r9
-  mov    [reg_p3+64], r10
-  mov    [reg_p3+72], r11
-  mov    [reg_p3+80], r12
-  mov    [reg_p3+88], r13
-    
-  mov    r8, [reg_p3+96]
-  mov    r9, [reg_p3+104]
-  mov    r10, [reg_p3+112]
-  mov    r11, [reg_p3+120]
-  bt     rax, 0  
-  sbb    r8, [reg_p1+96] 
-  sbb    r9, [reg_p1+104] 
-  sbb    r10, [reg_p1+112] 
-  sbb    r11, [reg_p1+120]
-  bt     rcx, 0  
-  sbb    r8, [reg_p2+96] 
-  sbb    r9, [reg_p2+104] 
-  sbb    r10, [reg_p2+112] 
-  sbb    r11, [reg_p2+120]
-  mov    [reg_p3+96], r8
-  mov    [reg_p3+104], r9
-  mov    [reg_p3+112], r10
-  mov    [reg_p3+120], r11
-  
-  pop    r13
-  pop    r12
-  ret
\ No newline at end of file
+//  Squaring in GF(p^2), complex part
+//  Operation: c [reg_p2] = 2a0 x a1
+//  Inputs: a = [a1, a0] stored in [reg_p1] (a0 is read from [reg_p1], a1 from [reg_p1+64])
+//  Output: c stored in [reg_p2] (eight 64-bit words are written)
+//***********************************************************************
+.global fmt(fp2sqr503_c1_asm)
+fmt(fp2sqr503_c1_asm):   
+    push   r12
+    push   r13 
+	// Load a0 and double it. The push/mov instructions interleaved with the adc chain do not modify CF.
+	mov    r8, [reg_p1]
+	mov    r9, [reg_p1+8]
+	mov    r10, [reg_p1+16]
+	mov    r11, [reg_p1+24]
+	mov    r12, [reg_p1+32]
+	mov    r13, [reg_p1+40] 
+	add    r8, r8                 // [r8:r15] <- 2*a0, carry propagated word by word
+	adc    r9, r9
+    push   r14 
+	adc    r10, r10
+	adc    r11, r11 
+    push   r15 
+	adc    r12, r12
+	mov    r14, [reg_p1+48] 
+	mov    r15, [reg_p1+56] 
+	adc    r13, r13 
+    push   rbx 
+	adc    r14, r14 
+    push   rbp
+	adc    r15, r15
+	sub    rsp, 64                // Reserving 64 bytes of stack for the words of 2*a0
+	mov    [rsp+8], r9            // Words 1-7 go to [rsp+8..56]; word 0 stays in r8 and [rsp] is not written here
+	mov    [rsp+16], r10
+    
+    // [r8:r15, rbp] <- z = a00 x a1, where a00 is word 0 of 2*a0
+    mov    rdx, r8                // rdx is the implicit mulx multiplicand
+    mulx   r9, r8, [reg_p1+64]  
+	mov    [rsp+24], r11          // Each remaining word of 2*a0 is spilled before mulx reuses its register
+    xor    rax, rax               // rax <- 0; also clears OF ahead of the adox chain
+    mulx   r10, r11, [reg_p1+72]  
+	mov    [rsp+32], r12 
+    adox   r9, r11        
+    mulx   r11, r12, [reg_p1+80] 
+	mov    [rsp+40], r13  
+    adox   r10, r12        
+    mulx   r12, r13, [reg_p1+88]  
+	mov    [rsp+48], r14 
+    adox   r11, r13       
+    mulx   r13, r14, [reg_p1+96] 
+	mov    [rsp+56], r15  
+    adox   r12, r14      
+    mulx   r14, r15, [reg_p1+104]   
+    adox   r13, r15      
+    mulx   r15, rbp, [reg_p1+112]  
+    adox   r14, rbp      
+    mulx   rbp, rbx, [reg_p1+120]  
+    adox   r15, rbx 
+    adox   rbp, rax               // rbp <- top word + last carry of the chain (rax = 0)
+    // NOTE(review): FPMUL512x512 is defined outside this section; presumably it handles the remaining words of 2*a0 and the reduction -- confirm
+	FPMUL512x512 [rsp], [reg_p1+64], r8, r9, r10, r11, r12, r13, r14, r15, rbp, rbx, rcx
+         
+    mov    [reg_p2], rbp          // c is written from rbp (word 0) and r8:r14 (words 1-7)
+    mov    [reg_p2+8], r8         
+    mov    [reg_p2+16], r9      
+    mov    [reg_p2+24], r10     
+    mov    [reg_p2+32], r11     
+    mov    [reg_p2+40], r12     
+    mov    [reg_p2+48], r13
+    mov    [reg_p2+56], r14
+	add    rsp, 64                // Restoring space in stack
+    pop    rbp                    // Registers are restored in reverse push order
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Field multiplication in GF(p)
+//  Operation: c = a x b mod p
+//  Inputs: a stored in [reg_p1], b stored in [reg_p2] 
+//  Output: c stored in [reg_p3] (eight 64-bit words are written)
+//***********************************************************************
+.global fmt(fpmul503_asm)
+fmt(fpmul503_asm): 
+    mov    rcx, reg_p3            // Keep the output pointer in rcx; rdx is overwritten next (NOTE(review): presumably reg_p3 is rdx -- confirm)
+     
+    // [r8:r15, rax] <- z = a x b0; each saved register is pushed before mulx first overwrites it
+    mov    rdx, [reg_p2]          // rdx (implicit mulx multiplicand) <- b0
+    mulx   r9, r8, [reg_p1]  
+    push   r12
+    xor    rax, rax               // rax <- 0; also clears OF ahead of the adox chain (push does not modify flags)
+    mulx   r10, r11, [reg_p1+8]
+    push   r13 
+    adox   r9, r11        
+    mulx   r11, r12, [reg_p1+16]
+    push   r14 
+    adox   r10, r12        
+    mulx   r12, r13, [reg_p1+24]   
+    push   r15 
+    adox   r11, r13       
+    mulx   r13, r14, [reg_p1+32]    
+    push   rbx   
+    adox   r12, r14      
+    mulx   r14, r15, [reg_p1+40]  
+    push   rbp
+    adox   r13, r15      
+    mulx   r15, rbx, [reg_p1+48]  
+    adox   r14, rbx      
+    mulx   rbx, rbp, [reg_p1+56]  
+    adox   r15, rbp 
+    adox   rax, rbx               // rax <- top word + last carry of the chain (rax was 0)
+    // NOTE(review): FPMUL512x512 is defined outside this section; presumably it handles the remaining words of b and the reduction -- confirm
+	FPMUL512x512 [reg_p2], [reg_p1], r8, r9, r10, r11, r12, r13, r14, r15, rax, rbx, rbp
+
+    mov    [rcx], rax             // c is written from rax (word 0) and r8:r14 (words 1-7)
+    mov    [rcx+8], r8         
+    mov    [rcx+16], r9         
+    mov    [rcx+24], r10      
+    mov    [rcx+32], r11      
+    mov    [rcx+40], r12      
+    mov    [rcx+48], r13     
+    mov    [rcx+56], r14
+    pop    rbp                    // Registers are restored in reverse push order
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
\ No newline at end of file
diff --git a/src/P503/ARM64/fp_arm64.c b/src/P503/ARM64/fp_arm64.c
index df4e8dc..cc36497 100644
--- a/src/P503/ARM64/fp_arm64.c
+++ b/src/P503/ARM64/fp_arm64.c
@@ -1,10 +1,15 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P503
 *********************************************************************************************/
 
 #include "../P503_internal.h"
+#include "../../internal.h"
 
 // Global constants
 extern const uint64_t p503[NWORDS_FIELD];
@@ -13,21 +18,21 @@ extern const uint64_t p503x2[NWORDS_FIELD];
 extern const uint64_t p503x4[NWORDS_FIELD];
 
 
-__inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub503_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. 
     
     mp_sub503_p2_asm(a, b, c); 
 } 
 
 
-__inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub503_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
     
     mp_sub503_p4_asm(a, b, c);
 }
 
 
-__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular addition, c = a+b mod p503.
   // Inputs: a, b in [0, 2*p503-1] 
   // Output: c in [0, 2*p503-1]
@@ -36,7 +41,7 @@ __inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c)
 } 
 
 
-__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular subtraction, c = a-b mod p503.
   // Inputs: a, b in [0, 2*p503-1] 
   // Output: c in [0, 2*p503-1] 
@@ -45,7 +50,7 @@ __inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c)
 }
 
 
-__inline void fpneg503(digit_t* a)
+inline void fpneg503(digit_t* a)
 { // Modular negation, a = -a mod p503.
   // Input/output: a in [0, 2*p503-1] 
     unsigned int i, borrow = 0;
diff --git a/src/P503/ARM64/fp_arm64_asm.S b/src/P503/ARM64/fp_arm64_asm.S
index 914d789..220b65a 100644
--- a/src/P503/ARM64/fp_arm64_asm.S
+++ b/src/P503/ARM64/fp_arm64_asm.S
@@ -1,5 +1,9 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license
 //
 // Abstract: field arithmetic in 64-bit ARMv8 assembly for P503 on Linux
 //*******************************************************************************************
diff --git a/src/P503/P503.c b/src/P503/P503.c
index caefc1e..9e576c7 100644
--- a/src/P503/P503.c
+++ b/src/P503/P503.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P503
 *********************************************************************************************/  
@@ -27,13 +31,10 @@ const uint64_t p503x2[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF
                                                      0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C }; 
 const uint64_t p503x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xAFFFFFFFFFFFFFFF, 
                                                      0x4C216F6888479E82, 0x6E6FDB21EDF9F6BC, 0x81171AF769DE9340, 0x01019BD506047879 };
+const uint64_t p503x8[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x5FFFFFFFFFFFFFFF, 
+                                                     0x9842DED1108F3D05, 0xDCDFB643DBF3ED78, 0x022E35EED3BD2680, 0x020337AA0C08F0F3 };
 const uint64_t p503p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000,
                                                      0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E };
-const uint64_t p503p1x64[NWORDS64_FIELD/2]       = { 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 };  
-const uint64_t p503x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 
-                                                     0x9EF484BBBDC30BEA, 0x8C8126F090304A1D, 0xF7472844B10B65FC, 0x30F32157CFDC3C33, 
-                                                     0x1463AB4329A333F7, 0xDFC933977C47D3A4, 0x338A3767F6F2520B, 0x4F8CB7565CCC13FA, 
-                                                     0xDE43B73AACD2189B, 0xBCF845CAC5405FBD, 0x516D02A09E684B7A, 0x0001033A4091BB86 }; 
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 }; 
 // Order of Bob's subgroup
@@ -96,6 +97,7 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fpneg                         fpneg503
 #define fpdiv2                        fpdiv2_503
 #define fpcorrection                  fpcorrection503
+#define fpmul                         fpmul503
 #define fpmul_mont                    fpmul503_mont
 #define fpsqr_mont                    fpsqr503_mont
 #define fpinv_mont                    fpinv503_mont
@@ -113,6 +115,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fp2correction                 fp2correction503
 #define fp2mul_mont                   fp2mul503_mont
 #define fp2sqr_mont                   fp2sqr503_mont
+#define fp2mul_c0_mont                fp2mul503_c0_mont
+#define fp2mul_c1_mont                fp2mul503_c1_mont
+#define fp2sqr_c0_mont                fp2sqr503_c0_mont
+#define fp2sqr_c1_mont                fp2sqr503_c1_mont
 #define fp2inv_mont                   fp2inv503_mont
 #define fp2inv_mont_bingcd            fp2inv503_mont_bingcd
 #define fpequal_non_constant_time     fpequal503_non_constant_time
diff --git a/src/P503/P503_api.h b/src/P503/P503_api.h
index 1c17447..1a209d3 100644
--- a/src/P503/P503_api.h
+++ b/src/P503/P503_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P503
 *********************************************************************************************/  
diff --git a/src/P503/P503_compressed.c b/src/P503/P503_compressed.c
index d3611e2..a68f98f 100644
--- a/src/P503/P503_compressed.c
+++ b/src/P503/P503_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * Supersingular Isogeny Key Encapsulation Library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P503_compressed
 *********************************************************************************************/ 
@@ -28,13 +32,10 @@ const uint64_t p503x2[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF
                                                      0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C }; 
 const uint64_t p503x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xAFFFFFFFFFFFFFFF, 
                                                      0x4C216F6888479E82, 0x6E6FDB21EDF9F6BC, 0x81171AF769DE9340, 0x01019BD506047879 };
+const uint64_t p503x8[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x5FFFFFFFFFFFFFFF, 
+                                                     0x9842DED1108F3D05, 0xDCDFB643DBF3ED78, 0x022E35EED3BD2680, 0x020337AA0C08F0F3 };
 const uint64_t p503p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000,
                                                      0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E };
-const uint64_t p503p1x64[NWORDS64_FIELD/2]       = { 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 };  
-const uint64_t p503x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 
-                                                     0x9EF484BBBDC30BEA, 0x8C8126F090304A1D, 0xF7472844B10B65FC, 0x30F32157CFDC3C33, 
-                                                     0x1463AB4329A333F7, 0xDFC933977C47D3A4, 0x338A3767F6F2520B, 0x4F8CB7565CCC13FA, 
-                                                     0xDE43B73AACD2189B, 0xBCF845CAC5405FBD, 0x516D02A09E684B7A, 0x0001033A4091BB86 };
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 }; 
 // Order of Bob's subgroup
@@ -353,6 +354,7 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fpneg                         fpneg503
 #define fpdiv2                        fpdiv2_503
 #define fpcorrection                  fpcorrection503
+#define fpmul                         fpmul503
 #define fpmul_mont                    fpmul503_mont
 #define fpsqr_mont                    fpsqr503_mont
 #define fpinv_mont                    fpinv503_mont
@@ -370,6 +372,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fp2correction                 fp2correction503
 #define fp2mul_mont                   fp2mul503_mont
 #define fp2sqr_mont                   fp2sqr503_mont
+#define fp2mul_c0_mont                fp2mul503_c0_mont
+#define fp2mul_c1_mont                fp2mul503_c1_mont
+#define fp2sqr_c0_mont                fp2sqr503_c0_mont
+#define fp2sqr_c1_mont                fp2sqr503_c1_mont
 #define fp2inv_mont                   fp2inv503_mont
 #define fp2inv_mont_bingcd            fp2inv503_mont_bingcd
 #define fpequal_non_constant_time     fpequal503_non_constant_time
diff --git a/src/P503/P503_compressed_api.h b/src/P503/P503_compressed_api.h
index 668f0f0..bc02f56 100644
--- a/src/P503/P503_compressed_api.h
+++ b/src/P503/P503_compressed_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P503 using compression
 *********************************************************************************************/  
diff --git a/src/P503/P503_compressed_dlog_tables.c b/src/P503/P503_compressed_dlog_tables.c
index 4ebb910..db9724a 100644
--- a/src/P503/P503_compressed_dlog_tables.c
+++ b/src/P503/P503_compressed_dlog_tables.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for Pohlig-Hellman when using compression
 *********************************************************************************************/ 
diff --git a/src/P503/P503_compressed_pair_tables.c b/src/P503/P503_compressed_pair_tables.c
index f7ee631..ff54f97 100644
--- a/src/P503/P503_compressed_pair_tables.c
+++ b/src/P503/P503_compressed_pair_tables.c
@@ -1,5 +1,9 @@
 /**************************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression
 ***************************************************************************************************/  
diff --git a/src/P503/P503_internal.h b/src/P503/P503_internal.h
index 53afc48..120f1e5 100644
--- a/src/P503/P503_internal.h
+++ b/src/P503/P503_internal.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: internal header file for P503
 *********************************************************************************************/  
@@ -168,6 +172,8 @@ void rdc503_asm(digit_t* ma, digit_t* mc);
             
 // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
 void fpmul503_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fpmul503(const digit_t* a, const digit_t* b, digit_t* c);
+void fpmul503_asm(const digit_t* a, const digit_t* b, digit_t* c);
 void mul503_asm(const digit_t* a, const digit_t* b, digit_t* c);
    
 // Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^768
@@ -207,9 +213,17 @@ void fp2correction503(f2elm_t a);
             
 // GF(p503^2) squaring using Montgomery arithmetic, c = a^2 in GF(p503^2)
 void fp2sqr503_mont(const f2elm_t a, f2elm_t c);
+void fp2sqr503_c0_mont(const digit_t* a, digit_t* c);
+void fp2sqr503_c0_asm(const digit_t* a, digit_t* c);
+void fp2sqr503_c1_mont(const digit_t* a, digit_t* c);
+void fp2sqr503_c1_asm(const digit_t* a, digit_t* c);
  
 // GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2)
 void fp2mul503_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+void fp2mul503_c0_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul503_c0_asm(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul503_c1_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul503_c1_asm(const digit_t* a, const digit_t* b, digit_t* c);
 
 // GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
 void fp2inv503_mont(f2elm_t a);
diff --git a/src/P503/generic/fp_generic.c b/src/P503/generic/fp_generic.c
index 87d8b09..1fbdff3 100755
--- a/src/P503/generic/fp_generic.c
+++ b/src/P503/generic/fp_generic.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: portable modular arithmetic for P503
 *********************************************************************************************/
diff --git a/src/P610/AMD64/fp_x64.c b/src/P610/AMD64/fp_x64.c
index e77022e..db8cb5b 100644
--- a/src/P610/AMD64/fp_x64.c
+++ b/src/P610/AMD64/fp_x64.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for x64 platforms for P610
 *********************************************************************************************/
@@ -17,7 +21,7 @@ extern const uint64_t p610x4[NWORDS_FIELD];
 
 inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -39,7 +43,7 @@ inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c)
 
 inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 610)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -50,11 +54,6 @@ inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c)
     for (i = 0; i < NWORDS_FIELD; i++) {
         ADDC(borrow, c[i], ((digit_t*)p610x4)[i], borrow, c[i]); 
     }
-    
-#elif (OS_TARGET == OS_NIX)                 
-    
-    mp_sub610_p4_asm(a, b, c);    
-
 #endif
 } 
 
@@ -161,13 +160,43 @@ void fpcorrection610(digit_t* a)
     }
 }
 
+#if (OS_TARGET == OS_NIX)
+
+void fp2mul610_c0_mont(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, b, c unchanged to fp2mul610_c0_asm. Presumably the c0 part of a GF(p610^2) product -- confirm against the asm.
+    fp2mul610_c0_asm(a, b, c);
+}
+
+
+void fp2mul610_c1_mont(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, b, c unchanged to fp2mul610_c1_asm. Presumably the c1 part of a GF(p610^2) product -- confirm against the asm.
+    fp2mul610_c1_asm(a, b, c);
+}
+
+
+void fp2sqr610_c0_mont(const digit_t* a, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, c unchanged to fp2sqr610_c0_asm. Presumably the c0 part of a GF(p610^2) squaring -- confirm against the asm.
+    fp2sqr610_c0_asm(a, c);
+}
+
+
+void fp2sqr610_c1_mont(const digit_t* a, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, c unchanged to fp2sqr610_c1_asm. Presumably the c1 part of a GF(p610^2) squaring -- confirm against the asm.
+    fp2sqr610_c1_asm(a, c);
+}
+
+
+void fpmul610(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Thin wrapper (built only when OS_TARGET == OS_NIX): passes a, b, c unchanged to fpmul610_asm. NOTE(review): reduction/Montgomery semantics are defined by the asm routine -- confirm there.
+    fpmul610_asm(a, b, c);
+}
+
+#else
 
 void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
 { // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
         
     UNREFERENCED_PARAMETER(nwords);
-
-#if (OS_TARGET == OS_WIN)
     digit_t t = 0;
     uint128_t uv = {0};
     unsigned int carry = 0;
@@ -462,12 +491,6 @@ void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int n
     MULADD128(a[9], b[9], uv, carry, uv);
     c[18] = uv[0];
     c[19] = uv[1];
-
-#elif (OS_TARGET == OS_NIX)
-    
-    mul610_asm(a, b, c);
-
-#endif
 }
 
 
@@ -476,8 +499,6 @@ void rdc_mont(digit_t* ma, digit_t* mc)
   // mc = ma*R^-1 mod p610x2, where R = 2^640.
   // If ma < 2^640*p610, the output mc is in the range [0, 2*p610-1].
   // ma is assumed to be in Montgomery representation.
-        
-#if (OS_TARGET == OS_WIN)
     unsigned int carry;
     digit_t t = 0;
     uint128_t uv = {0};
@@ -717,10 +738,6 @@ void rdc_mont(digit_t* ma, digit_t* mc)
     ADDC(0, uv[0], ma[18], carry, mc[8]); 
     ADDC(carry, uv[1], 0, carry, uv[1]); 
     ADDC(0, uv[1], ma[19], carry, mc[9]); 
-    
-#elif (OS_TARGET == OS_NIX)                 
-    
-    rdc610_asm(ma, mc);    
+}
 
-#endif
-}
\ No newline at end of file
+#endif
\ No newline at end of file
diff --git a/src/P610/AMD64/fp_x64_asm.S b/src/P610/AMD64/fp_x64_asm.S
index 8860cf6..0997164 100644
--- a/src/P610/AMD64/fp_x64_asm.S
+++ b/src/P610/AMD64/fp_x64_asm.S
@@ -1,1310 +1,1098 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
 //
-// Abstract: field arithmetic in x64 assembly for P610 on Linux 
-//*******************************************************************************************  
-
-.intel_syntax noprefix 
-
-// Format function and variable names for Mac OS X
-#if defined(__APPLE__)
-    #define fmt(f)    _##f
-#else
-    #define fmt(f)    f
-#endif
-
-// Registers that are used for parameter passing:
-#define reg_p1  rdi
-#define reg_p2  rsi
-#define reg_p3  rdx
-
-
-.text
-//***********************************************************************
-//  Field addition
-//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//*********************************************************************** 
-.global fmt(fpadd610_asm)
-fmt(fpadd610_asm):
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  mov    r15, [reg_p1+56] 
-  mov    rcx, [reg_p1+64]
-  mov    rdi, [reg_p1+72]
-  add    r8, [reg_p2] 
-  adc    r9, [reg_p2+8] 
-  adc    r10, [reg_p2+16] 
-  adc    r11, [reg_p2+24] 
-  adc    r12, [reg_p2+32] 
-  adc    r13, [reg_p2+40] 
-  adc    r14, [reg_p2+48] 
-  adc    r15, [reg_p2+56]
-  adc    rcx, [reg_p2+64]
-  adc    rdi, [reg_p2+72]
-
-  mov    rax, [rip+fmt(p610x2)]
-  sub    r8, rax
-  mov    rax, [rip+fmt(p610x2)+8]
-  sbb    r9, rax
-  sbb    r10, rax
-  sbb    r11, rax
-  mov    rax, [rip+fmt(p610x2)+32]
-  sbb    r12, rax
-  mov    rax, [rip+fmt(p610x2)+40]
-  sbb    r13, rax
-  mov    rax, [rip+fmt(p610x2)+48]
-  sbb    r14, rax
-  mov    rax, [rip+fmt(p610x2)+56]
-  sbb    r15, rax
-  mov    rax, [rip+fmt(p610x2)+64]
-  sbb    rcx, rax
-  mov    rax, [rip+fmt(p610x2)+72]
-  sbb    rdi, rax
-  mov    [reg_p3+64], rcx
-  mov    [reg_p3+72], rdi
-  mov    rax, 0
-  sbb    rax, 0
-  
-  mov    rsi, [rip+fmt(p610x2)]
-  and    rsi, rax
-  mov    rdi, [rip+fmt(p610x2)+8]
-  and    rdi, rax
-  
-  add    r8, rsi  
-  adc    r9, rdi 
-  adc    r10, rdi 
-  adc    r11, rdi
-  mov    [reg_p3], r8 
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11 
-  setc   cl
-  
-  mov    rdi, [rip+fmt(p610x2)+32]
-  and    rdi, rax
-  mov    rsi, [rip+fmt(p610x2)+40]
-  and    rsi, rax
-  mov    r8, [rip+fmt(p610x2)+48]
-  and    r8, rax
-  mov    r9, [rip+fmt(p610x2)+56]
-  and    r9, rax
-  mov    r10, [rip+fmt(p610x2)+64]
-  and    r10, rax
-  mov    r11, [rip+fmt(p610x2)+72]
-  and    r11, rax
-  
-  bt     rcx, 0
-  adc    r12, rdi
-  adc    r13, rsi  
-  adc    r14, r8
-  adc    r15, r9
-  mov    rsi, [reg_p3+64]
-  mov    rdi, [reg_p3+72]
-  adc    rsi, r10  
-  adc    rdi, r11
-  mov    [reg_p3+32], r12  
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], r14
-  mov    [reg_p3+56], r15  
-  mov    [reg_p3+64], rsi
-  mov    [reg_p3+72], rdi
-
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
-//***********************************************************************
-//  Field subtraction
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]
-//*********************************************************************** 
-.global fmt(fpsub610_asm)
-fmt(fpsub610_asm):
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  mov    r15, [reg_p1+56] 
-  mov    rcx, [reg_p1+64] 
-  mov    rdi, [reg_p1+72]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    r14, [reg_p2+48] 
-  sbb    r15, [reg_p2+56]
-  sbb    rcx, [reg_p2+64]
-  sbb    rdi, [reg_p2+72]
-  mov    [reg_p3+64], rcx
-  mov    [reg_p3+72], rdi
-  mov    rax, 0
-  sbb    rax, 0
-    
-  mov    rsi, [rip+fmt(p610x2)]
-  and    rsi, rax
-  mov    rdi, [rip+fmt(p610x2)+8]
-  and    rdi, rax
-  
-  add    r8, rsi  
-  adc    r9, rdi 
-  adc    r10, rdi 
-  adc    r11, rdi
-  mov    [reg_p3], r8 
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11 
-  setc   cl
-  
-  mov    rdi, [rip+fmt(p610x2)+32]
-  and    rdi, rax
-  mov    rsi, [rip+fmt(p610x2)+40]
-  and    rsi, rax
-  mov    r8, [rip+fmt(p610x2)+48]
-  and    r8, rax
-  mov    r9, [rip+fmt(p610x2)+56]
-  and    r9, rax
-  mov    r10, [rip+fmt(p610x2)+64]
-  and    r10, rax
-  mov    r11, [rip+fmt(p610x2)+72]
-  and    r11, rax
-  
-  bt     rcx, 0
-  adc    r12, rdi
-  adc    r13, rsi  
-  adc    r14, r8
-  adc    r15, r9
-  mov    rsi, [reg_p3+64]
-  mov    rdi, [reg_p3+72]
-  adc    rsi, r10  
-  adc    rdi, r11
-  mov    [reg_p3+32], r12  
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], r14
-  mov    [reg_p3+56], r15  
-  mov    [reg_p3+64], rsi
-  mov    [reg_p3+72], rdi
-  
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret 
-
-
-///////////////////////////////////////////////////////////////// MACRO
-.macro SUB610_PX  P0
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    r12, [reg_p1+32]
-  mov    r13, [reg_p1+40]
-  mov    r14, [reg_p1+48]
-  mov    r15, [reg_p1+56]
-  mov    rax, [reg_p1+64]
-  mov    rcx, [reg_p1+72]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40]
-  sbb    r14, [reg_p2+48] 
-  sbb    r15, [reg_p2+56] 
-  sbb    rax, [reg_p2+64] 
-  sbb    rcx, [reg_p2+72] 
-
-  mov    rdi, [rip+\P0]
-  mov    rsi, [rip+\P0+8]
-  add    r8, rdi  
-  adc    r9, rsi  
-  adc    r10, rsi 
-  adc    r11, rsi 
-  mov    rdi, [rip+\P0+32]
-  mov    rsi, [rip+\P0+40]
-  adc    r12, rdi   
-  adc    r13, rsi   
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9 
-  mov    [reg_p3+16], r10 
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12 
-  mov    [reg_p3+40], r13
-  mov    rdi, [rip+\P0+48]
-  mov    rsi, [rip+\P0+56]
-  adc    r14, rdi  
-  adc    r15, rsi  
-  mov    rdi, [rip+\P0+64]
-  mov    rsi, [rip+\P0+72]
-  adc    rax, rdi 
-  adc    rcx, rsi  
-  mov    [reg_p3+48], r14
-  mov    [reg_p3+56], r15 
-  mov    [reg_p3+64], rax 
-  mov    [reg_p3+72], rcx
-  
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  .endm
-
-
-//***********************************************************************
-//  Multiprecision subtraction with correction with 2*p610
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p610
-//*********************************************************************** 
-.global fmt(mp_sub610_p2_asm)
-fmt(mp_sub610_p2_asm):
-
-  SUB610_PX  fmt(p610x2)
-  ret
-
-
-//***********************************************************************
-//  Multiprecision subtraction with correction with 4*p610
-//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 4*p610
-//*********************************************************************** 
-.global fmt(mp_sub610_p4_asm)
-fmt(mp_sub610_p4_asm):
-
-  SUB610_PX  fmt(p610x4)
-  ret
-
-
-#ifdef _MULX_
-
-/////////////////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  memory pointers M0 and M1
-// Outputs: memory pointer C
-// Temps:   regs T0:T7
-///////////////////////////////////////////////////////////////////////////
-#ifdef _ADX_
-
-.macro MUL320_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7 
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1    
-    mulx   \T2, \T3, 8\M1
-    mov    \C, \T1             // C0_final 
-    xor    rax, rax
-    mulx   \T4, \T5, 16\M1 
-    adox   \T0, \T3               
-    adox   \T2, \T5     
-    mulx   \T1, \T3, 24\M1
-    adox   \T4, \T3         
-    mulx   \T5, \T6, 32\M1 
-    adox   \T1, \T6     
-    adox   \T5, rax        
-    
-    mov    rdx, 8\M0 
-    mulx   \T6, \T7, \M1 
-    xor    rax, rax
-    adcx   \T0, \T7 
-    mov    8\C, \T0            // C1_final 
-    adcx   \T2, \T6     
-    mulx   \T6, \T7, 8\M1
-    adox   \T2, \T7 
-    adcx   \T4, \T6        
-    mulx   \T0, \T6, 16\M1 
-    adox   \T4, \T6  
-    adcx   \T0, \T1     
-    mulx   \T1, \T7, 24\M1   
-    adcx   \T1, \T5 
-    adox   \T0, \T7   
-    mulx   \T5, \T6, 32\M1 
-    adcx   \T5, rax         
-    adox   \T1, \T6  
-    adox   \T5, rax         
-    
-    mov    rdx, 16\M0 
-    mulx   \T6, \T7, \M1
-    xor    rax, rax 
-    adcx   \T2, \T7 
-    mov    16\C, \T2           // C2_final 
-    adcx   \T4, \T6     
-    mulx   \T6, \T7, 8\M1
-    adox   \T4, \T7 
-    adcx   \T0, \T6        
-    mulx   \T2, \T6, 16\M1
-    adox   \T0, \T6 
-    adcx   \T1, \T2     
-    mulx   \T2, \T7, 24\M1   
-    adcx   \T5, \T2          
-    adox   \T1, \T7   
-    mulx   \T2, \T6, 32\M1   
-    adcx   \T2, rax 
-    adox   \T5, \T6 
-    adox   \T2, rax           
-    
-    mov    rdx, 24\M0 
-    mulx   \T6, \T7, \M1
-    xor    rax, rax 
-    adcx   \T4, \T7 
-    mov    24\C, \T4           // C3_final 
-    adcx   \T0, \T6     
-    mulx   \T6, \T7, 8\M1
-    adox   \T0, \T7
-    adcx   \T1, \T6        
-    mulx   \T4, \T6, 16\M1
-    adox   \T1, \T6  
-    adcx   \T5, \T4     
-    mulx   \T4, \T7, 24\M1   
-    adcx   \T2, \T4        
-    adox   \T5, \T7   
-    mulx   \T4, \T6, 32\M1   
-    adcx   \T4, rax 
-    adox   \T2, \T6  
-    adox   \T4, rax         
-    
-    mov    rdx, 32\M0 
-    mulx   \T6, \T7, \M1 
-    xor    rax, rax
-    adcx   \T0, \T7 
-    mov    32\C, \T0           // C4_final 
-    adcx   \T1, \T6     
-    mulx   \T6, \T7, 8\M1
-    adox   \T1, \T7 
-    adcx   \T5, \T6        
-    mulx   \T0, \T6, 16\M1 
-    adox   \T5, \T6 
-    adcx   \T2, \T0     
-    mulx   \T0, \T7, 24\M1   
-    adcx   \T4, \T0 
-    adox   \T2, \T7  
-    mulx   \T0, \T6, 32\M1   
-    adcx   \T0, rax           
-    adox   \T4, \T6 
-    adox   \T0, rax 
-
-    mov    40\C, \T1 
-    mov    48\C, \T5 
-    mov    56\C, \T2 
-    mov    64\C, \T4
-    mov    72\C, \T0
-.endm
-
-#else
-
-.macro MUL320_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7
-    mov    rdx, \M0
-    mulx   \T0, \T1, \M1    
-    mulx   \T2, \T3, 8\M1
-    mov    \C, \T1             // C0_final 
-    mulx   \T4, \T5, 16\M1 
-    add    \T0, \T3               
-    adc    \T2, \T5     
-    mulx   \T1, \T3, 24\M1
-    adc    \T3, \T4         
-    mulx   \T5, \T6, 32\M1 
-    adc    \T1, \T6     
-    adc    \T5, 0        
-    
-    mov    rdx, 8\M0 
-    mulx   \T6, \T7, \M1 
-    add    \T0, \T7 
-    mov    8\C, \T0            // C1_final 
-    adc    \T2, \T6     
-    mulx   \T6, \T7, 8\M1
-    adc    \T3, \T6        
-    mulx   \T0, \T4, 16\M1
-    adc    \T0, \T1     
-    mulx   \T1, \T6, 24\M1   
-    adc    \T5, \T1  
-    mulx   \T1, rax, 32\M1     
-    adc    \T1, 0 
-        
-    add    \T2, \T7 
-    adc    \T3, \T4  
-    adc    \T0, \T6  
-    adc    \T5, rax  
-    adc    \T1, 0         
-    
-    mov    rdx, 16\M0 
-    mulx   \T4, \T6, \M1 
-    add    \T2, \T6 
-    mov    16\C, \T2           // C2_final 
-    adc    \T3, \T4     
-    mulx   \T6, \T7, 8\M1
-    adc    \T0, \T6        
-    mulx   \T2, \T4, 16\M1 
-    adc    \T2, \T5     
-    mulx   \T5, \T6, 24\M1   
-    adc    \T1, \T5 
-    mulx   \T5, rax, 32\M1     
-    adc    \T5, 0 
-        
-    add    \T3, \T7
-    adc    \T0, \T4  
-    adc    \T2, \T6  
-    adc    \T1, rax 
-    adc    \T5, 0          
-    
-    mov    rdx, 24\M0
-    mulx   \T4, \T6, \M1 
-    add    \T3, \T6 
-    mov    24\C, \T3           // C3_final 
-    adc    \T0, \T4     
-    mulx   \T6, \T7, 8\M1
-    adc    \T2, \T6        
-    mulx   \T3, \T4, 16\M1 
-    adc    \T1, \T3     
-    mulx   \T3, \T6, 24\M1   
-    adc    \T3, \T5 
-    mulx   \T5, rax, 32\M1     
-    adc    \T5, 0
-        
-    add    \T0, \T7
-    adc    \T2, \T4  
-    adc    \T1, \T6  
-    adc    \T3, rax 
-    adc    \T5, 0       
-    
-    mov    rdx, 32\M0 
-    mulx   \T4, \T6, \M1 
-    add    \T0, \T6 
-    mov    32\C, \T0           // C4_final 
-    adc    \T2, \T4     
-    mulx   \T6, \T7, 8\M1
-    adc    \T1, \T6        
-    mulx   \T0, \T4, 16\M1 
-    adc    \T3, \T0     
-    mulx   \T0, \T6, 24\M1   
-    adc    \T0, \T5 
-    mulx   \T5, rax, 32\M1     
-    adc    \T5, 0
-        
-    add    \T2, \T7 
-    adc    \T1, \T4  
-    adc    \T3, \T6 
-    adc    \T0, rax 
-    adc    \T5, 0 
-    mov    40\C, \T2 
-    mov    48\C, \T1 
-    mov    56\C, \T3 
-    mov    64\C, \T0
-    mov    72\C, \T5 
-.endm
-
-#endif
-
-
-//*****************************************************************************
-//  610-bit multiplication using Karatsuba (one level), schoolbook (two levels)
-//***************************************************************************** 
-.global fmt(mul610_asm)
-fmt(mul610_asm):    
-    push   r12
-    push   r13 
-    push   r14 
-    push   r15
-    mov    rcx, reg_p3 
-
-    // [rsp] <- AH + AL, rax <- mask
-    xor    rax, rax
-    mov    r8, [reg_p1]
-    mov    r9, [reg_p1+8]
-    mov    r10, [reg_p1+16]
-    mov    r11, [reg_p1+24] 
-    mov    r12, [reg_p1+32] 
-    push   rbx 
-    sub    rsp, 112
-    add    r8, [reg_p1+40]
-    adc    r9, [reg_p1+48]
-    adc    r10, [reg_p1+56]
-    adc    r11, [reg_p1+64]
-    adc    r12, [reg_p1+72]
-    sbb    rax, 0
-    mov    [rsp], r8
-    mov    [rsp+8], r9
-    mov    [rsp+16], r10
-    mov    [rsp+24], r11
-    mov    [rsp+32], r12
-
-    // [rsp+40] <- BH + BL, rdx <- mask
-    xor    rdx, rdx
-    mov    r8, [reg_p2]
-    mov    r9, [reg_p2+8]
-    mov    rbx, [reg_p2+16]
-    mov    r13, [reg_p2+24] 
-    mov    r14, [reg_p2+32]    
-    add    r8, [reg_p2+40]
-    adc    r9, [reg_p2+48]
-    adc    rbx, [reg_p2+56]
-    adc    r13, [reg_p2+64]
-    adc    r14, [reg_p2+72]
-    sbb    rdx, 0
-    mov    [rsp+40], r8
-    mov    [rsp+48], r9
-    mov    [rsp+56], rbx
-    mov    [rsp+64], r13
-    mov    [rsp+72], r14     
-    
-    // [rcx] <- masked (BH + BL)
-    and    r8, rax
-    and    r9, rax
-    and    rbx, rax
-    and    r13, rax
-    and    r14, rax    
-    mov    [rcx], r8
-    mov    [rcx+8], r9
-
-    // r8-r12 <- masked (AH + AL)
-    mov    r8, [rsp]
-    mov    r9, [rsp+8]
-    and    r8, rdx
-    and    r9, rdx
-    and    r10, rdx
-    and    r11, rdx
-    and    r12, rdx
-
-    // [rsp+80] <- masked (AH + AL) + masked (BH + BL)
-    mov    rax, [rcx]
-    mov    rdx, [rcx+8]
-    add    r8, rax
-    adc    r9, rdx
-    adc    r10, rbx
-    adc    r11, r13
-    adc    r12, r14        
-    mov    [rsp+80], r8
-    mov    [rsp+88], r9
-    mov    [rsp+96], r10
-    mov    [rsp+104], r11
-
-    // [rcx] <- AL x BL
-    MUL320_SCHOOL  [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, rbx, r13, r14, r15     // Result C0-C4 
-
-    // [rcx+80] <- (AH+AL) x (BH+BL), low part 
-    MUL320_SCHOOL  [rsp], [rsp+40], [rcx+80], r8, r9, r10, r11, rbx, r13, r14, r15
-
-    // [rsp] <- AH x BH 
-    MUL320_SCHOOL  [reg_p1+40], [reg_p2+40], [rsp], r8, r9, r10, r11, rbx, r13, r14, r15
-    
-    // r8-r12 <- (AH+AL) x (BH+BL), final step
-    mov    r8, [rsp+80]
-    mov    r9, [rsp+88]
-    mov    r10, [rsp+96]
-    mov    r11, [rsp+104]
-    mov    rax, [rcx+120]
-    add    r8, rax
-    mov    rax, [rcx+128]
-    adc    r9, rax
-    mov    rax, [rcx+136]
-    adc    r10, rax
-    mov    rax, [rcx+144]
-    adc    r11, rax
-    mov    rax, [rcx+152]
-    adc    r12, rax
-    
-    // rdi,rdx,rbx,r13,r14,r8-r12 <- (AH+AL) x (BH+BL) - ALxBL
-    mov    rdi, [rcx+80]
-    sub    rdi, [rcx]
-    mov    rdx, [rcx+88]
-    sbb    rdx, [rcx+8]
-    mov    rbx, [rcx+96]
-    sbb    rbx, [rcx+16]
-    mov    r13, [rcx+104]
-    sbb    r13, [rcx+24]
-    mov    r14, [rcx+112]     
-    sbb    r14, [rcx+32]  
-    sbb    r8, [rcx+40]
-    sbb    r9, [rcx+48]
-    sbb    r10, [rcx+56]
-    sbb    r11, [rcx+64]
-    sbb    r12, [rcx+72]
-    
-    // rdi,rdx,rbx,r13,r14,r8-r12 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
-    sub    rdi, [rsp]
-    sbb    rdx, [rsp+8]
-    sbb    rbx, [rsp+16]
-    sbb    r13, [rsp+24]
-    sbb    r14, [rsp+32]  
-    sbb    r8, [rsp+40]
-    sbb    r9, [rsp+48]
-    sbb    r10, [rsp+56]
-    sbb    r11, [rsp+64]
-    sbb    r12, [rsp+72]
-    
-    mov    rax, [rcx+40]
-    add    rax, rdi
-    mov    [rcx+40], rax    // Result C5-C9
-    mov    rax, [rcx+48]
-    adc    rax, rdx
-    mov    [rcx+48], rax 
-    mov    rax, [rcx+56]
-    adc    rax, rbx
-    mov    [rcx+56], rax 
-    mov    rax, [rcx+64]
-    adc    rax, r13
-    mov    [rcx+64], rax 
-    mov    rax, [rcx+72]
-    adc    rax, r14           
-    mov    [rcx+72], rax 
-    mov    rax, [rsp]
-    adc    r8, rax 
-    mov    [rcx+80], r8    // Result C10-C19
-    mov    rax, [rsp+8]
-    adc    r9, rax
-    mov    [rcx+88], r9 
-    mov    rax, [rsp+16]
-    adc    r10, rax
-    mov    [rcx+96], r10 
-    mov    rax, [rsp+24]
-    adc    r11, rax
-    mov    [rcx+104], r11 
-    mov    rax, [rsp+32]
-    adc    r12, rax
-    mov    [rcx+112], r12 
-    mov    r8, [rsp+40]
-    mov    r9, [rsp+48]
-    mov    r10, [rsp+56]
-    mov    r11, [rsp+64]
-    mov    r12, [rsp+72]
-    adc    r8, 0
-    adc    r9, 0
-    adc    r10, 0
-    adc    r11, 0
-    adc    r12, 0
-    add    rsp, 112   
-    mov    [rcx+120], r8 
-    mov    [rcx+128], r9 
-    mov    [rcx+136], r10 
-    mov    [rcx+144], r11 
-    mov    [rcx+152], r12 
-      
-    pop    rbx
-    pop    r15
-    pop    r14
-    pop    r13
-    pop    r12
-    ret
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license 
+//
+// Abstract: field arithmetic in x64 assembly for P610 on Linux
+//******************************************************************************************* 
+
+.intel_syntax noprefix 
+
+// Format function and variable names for Mac OS X
+#if defined(__APPLE__)
+    #define fmt(f)    _##f
+#else
+    #define fmt(f)    f
+#endif
+
+// Registers that are used for parameter passing:
+#define reg_p1  rdi
+#define reg_p2  rsi
+#define reg_p3  rdx
+
+
+.text
+//***********************************************************************
+//  610-bit multiprecision addition
+//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
+//*********************************************************************** 
+.global fmt(mp_add610_asm)
+fmt(mp_add610_asm):  // Adds ten 64-bit words (byte offsets 0..72) of a and b into c; uses only r8-r11 and rax, so nothing is pushed
+  mov    r8, [reg_p1]              // r8-r11, rax <- a[0..4]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    rax, [reg_p1+32]
+  add    r8, [reg_p2]              // start of the carry chain: a[0] + b[0]
+  adc    r9, [reg_p2+8] 
+  adc    r10, [reg_p2+16] 
+  adc    r11, [reg_p2+24] 
+  adc    rax, [reg_p2+32]          // CF now holds the carry out of word 4
+  mov    [reg_p3], r8              // c[0..4] <- low half (mov does not modify CF)
+  mov    [reg_p3+8], r9
+  mov    [reg_p3+16], r10
+  mov    [reg_p3+24], r11
+  mov    [reg_p3+32], rax
+
+  mov    r8, [reg_p1+40]           // r8-r11, rax <- a[5..9]; the loads keep CF from the low half intact
+  mov    r9, [reg_p1+48] 
+  mov    r10, [reg_p1+56]
+  mov    r11, [reg_p1+64] 
+  mov    rax, [reg_p1+72] 
+  adc    r8, [reg_p2+40]           // adc (not add): continues the carry chain from word 4
+  adc    r9, [reg_p2+48]
+  adc    r10, [reg_p2+56] 
+  adc    r11, [reg_p2+64]
+  adc    rax, [reg_p2+72]          // carry out of word 9 stays in CF only; no instruction captures it
+  mov    [reg_p3+40], r8           // c[5..9] <- high half
+  mov    [reg_p3+48], r9
+  mov    [reg_p3+56], r10
+  mov    [reg_p3+64], r11
+  mov    [reg_p3+72], rax
+  ret                              // rax still holds the top sum word (c[9]) at return
+
+
+//***********************************************************************
+//  Field addition
+//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
+//*********************************************************************** 
+.global fmt(fpadd610_asm)
+fmt(fpadd610_asm):                 // c = a + b - p610x2, with p610x2 added back (via a mask, no branch) if that subtraction borrowed
+  push   r12                       // r12-r15 are used as scratch below and restored before ret
+  push   r13
+  push   r14
+  push   r15
+  
+  mov    r8, [reg_p1]              // r8-r15, rcx, rdi <- a[0..9]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  mov    r15, [reg_p1+56] 
+  mov    rcx, [reg_p1+64]
+  mov    rdi, [reg_p1+72]          // overwrites reg_p1 (rdi); a is not read again after this load
+  add    r8, [reg_p2]              // 10-word sum a + b with carry propagation
+  adc    r9, [reg_p2+8] 
+  adc    r10, [reg_p2+16] 
+  adc    r11, [reg_p2+24] 
+  adc    r12, [reg_p2+32] 
+  adc    r13, [reg_p2+40] 
+  adc    r14, [reg_p2+48] 
+  adc    r15, [reg_p2+56]
+  adc    rcx, [reg_p2+64]
+  adc    rdi, [reg_p2+72]
+
+  mov    rax, [rip+fmt(p610x2)]    // subtract p610x2 word by word (mov does not disturb the borrow chain)
+  sub    r8, rax
+  mov    rax, [rip+fmt(p610x2)+8]  // this one value is reused for words 1-3; relies on p610x2[1..3] being equal -- confirm against the constant
+  sbb    r9, rax
+  sbb    r10, rax
+  sbb    r11, rax
+  mov    rax, [rip+fmt(p610x2)+32]
+  sbb    r12, rax
+  mov    rax, [rip+fmt(p610x2)+40]
+  sbb    r13, rax
+  mov    rax, [rip+fmt(p610x2)+48]
+  sbb    r14, rax
+  mov    rax, [rip+fmt(p610x2)+56]
+  sbb    r15, rax
+  mov    rax, [rip+fmt(p610x2)+64]
+  sbb    rcx, rax
+  mov    rax, [rip+fmt(p610x2)+72]
+  sbb    rdi, rax
+  mov    [reg_p3+64], rcx          // park words 8-9 in c so rcx/rdi can be reused as scratch
+  mov    [reg_p3+72], rdi
+  mov    rax, 0                    // mov (not xor) so the final borrow in CF survives
+  sbb    rax, 0                    // rax = 0 - CF: all-ones mask if the subtraction borrowed, else 0
+  
+  mov    rsi, [rip+fmt(p610x2)]    // overwrites reg_p2 (rsi); b is not read again
+  and    rsi, rax                  // masked p610x2[0]
+  mov    rdi, [rip+fmt(p610x2)+8]
+  and    rdi, rax                  // masked p610x2[1], reused for words 1-3 as above
+  
+  add    r8, rsi                   // add the masked correction back to words 0-3
+  adc    r9, rdi 
+  adc    r10, rdi 
+  adc    r11, rdi
+  mov    [reg_p3], r8 
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11 
+  setc   cl                        // save the carry out of word 3; the `and` instructions below clobber CF
+  
+  mov    rdi, [rip+fmt(p610x2)+32] // masked p610x2[4..9] -> rdi, rsi, r8, r9, r10, r11
+  and    rdi, rax
+  mov    rsi, [rip+fmt(p610x2)+40]
+  and    rsi, rax
+  mov    r8, [rip+fmt(p610x2)+48]
+  and    r8, rax
+  mov    r9, [rip+fmt(p610x2)+56]
+  and    r9, rax
+  mov    r10, [rip+fmt(p610x2)+64]
+  and    r10, rax
+  mov    r11, [rip+fmt(p610x2)+72]
+  and    r11, rax
+  
+  bt     rcx, 0                    // reload the saved carry (bit 0 of cl) into CF
+  adc    r12, rdi                  // continue the add-back for words 4-7
+  adc    r13, rsi  
+  adc    r14, r8
+  adc    r15, r9
+  mov    rsi, [reg_p3+64]          // fetch the parked words 8-9 (mov keeps CF)
+  mov    rdi, [reg_p3+72]
+  adc    rsi, r10  
+  adc    rdi, r11
+  mov    [reg_p3+32], r12  
+  mov    [reg_p3+40], r13
+  mov    [reg_p3+48], r14
+  mov    [reg_p3+56], r15  
+  mov    [reg_p3+64], rsi
+  mov    [reg_p3+72], rdi
+
+  pop    r15                       // restore in reverse push order
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+
+//***********************************************************************
+//  Field subtraction
+//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2], plus p610x2 if the subtraction borrows
+//*********************************************************************** 
+.global fmt(fpsub610_asm)
+fmt(fpsub610_asm):
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  // Load the ten 64-bit words of a, then subtract b with a single borrow chain
+  mov    r8, [reg_p1]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  mov    r15, [reg_p1+56] 
+  mov    rcx, [reg_p1+64] 
+  mov    rdi, [reg_p1+72]
+  sub    r8, [reg_p2] 
+  sbb    r9, [reg_p2+8] 
+  sbb    r10, [reg_p2+16] 
+  sbb    r11, [reg_p2+24] 
+  sbb    r12, [reg_p2+32] 
+  sbb    r13, [reg_p2+40] 
+  sbb    r14, [reg_p2+48] 
+  sbb    r15, [reg_p2+56]
+  sbb    rcx, [reg_p2+64]
+  sbb    rdi, [reg_p2+72]
+  mov    [reg_p3+64], rcx         // park the two top words in c; rcx and rdi are reused below
+  mov    [reg_p3+72], rdi
+  mov    rax, 0                   // mov leaves the borrow in CF untouched
+  sbb    rax, 0                   // rax <- all ones if a - b borrowed, else 0
+  // Add (p610x2 AND rax); the offset-8 word is reused for offsets 16 and 24
+  mov    rsi, [rip+fmt(p610x2)]
+  and    rsi, rax
+  mov    rdi, [rip+fmt(p610x2)+8]
+  and    rdi, rax
+  
+  add    r8, rsi  
+  adc    r9, rdi 
+  adc    r10, rdi 
+  adc    r11, rdi
+  mov    [reg_p3], r8 
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11 
+  setc   cl                       // save the carry of the low four words; the and instructions below clear CF
+  
+  mov    rdi, [rip+fmt(p610x2)+32]
+  and    rdi, rax
+  mov    rsi, [rip+fmt(p610x2)+40]
+  and    rsi, rax
+  mov    r8, [rip+fmt(p610x2)+48]
+  and    r8, rax
+  mov    r9, [rip+fmt(p610x2)+56]
+  and    r9, rax
+  mov    r10, [rip+fmt(p610x2)+64]
+  and    r10, rax
+  mov    r11, [rip+fmt(p610x2)+72]
+  and    r11, rax
+  
+  bt     rcx, 0                   // reload the saved carry into CF
+  adc    r12, rdi
+  adc    r13, rsi  
+  adc    r14, r8
+  adc    r15, r9
+  mov    rsi, [reg_p3+64]         // fetch the two top words parked in c above
+  mov    rdi, [reg_p3+72]
+  adc    rsi, r10  
+  adc    rdi, r11
+  mov    [reg_p3+32], r12  
+  mov    [reg_p3+40], r13
+  mov    [reg_p3+48], r14
+  mov    [reg_p3+56], r15  
+  mov    [reg_p3+64], rsi
+  mov    [reg_p3+72], rdi
+  
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret 
+
+
+///////////////////////////////////////////////////////////////// MACRO
+.macro SUB610_PX  P0
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  // c [reg_p3] = a [reg_p1] - b [reg_p2] + P0 over ten 64-bit words, with no conditional correction
+  mov    r8, [reg_p1]
+  mov    r9, [reg_p1+8]
+  mov    r10, [reg_p1+16]
+  mov    r11, [reg_p1+24]
+  mov    r12, [reg_p1+32]
+  mov    r13, [reg_p1+40]
+  mov    r14, [reg_p1+48]
+  mov    r15, [reg_p1+56]
+  mov    rax, [reg_p1+64]
+  mov    rcx, [reg_p1+72]
+  sub    r8, [reg_p2] 
+  sbb    r9, [reg_p2+8] 
+  sbb    r10, [reg_p2+16] 
+  sbb    r11, [reg_p2+24] 
+  sbb    r12, [reg_p2+32] 
+  sbb    r13, [reg_p2+40]
+  sbb    r14, [reg_p2+48] 
+  sbb    r15, [reg_p2+56] 
+  sbb    rax, [reg_p2+64] 
+  sbb    rcx, [reg_p2+72] 
+  // Add P0; the word at offset 8 is reused for offsets 16 and 24. The mov instructions interleaved below do not touch CF
+  mov    rdi, [rip+\P0]
+  mov    rsi, [rip+\P0+8]
+  add    r8, rdi  
+  adc    r9, rsi  
+  adc    r10, rsi 
+  adc    r11, rsi 
+  mov    rdi, [rip+\P0+32]
+  mov    rsi, [rip+\P0+40]
+  adc    r12, rdi   
+  adc    r13, rsi   
+  mov    [reg_p3], r8
+  mov    [reg_p3+8], r9 
+  mov    [reg_p3+16], r10 
+  mov    [reg_p3+24], r11
+  mov    [reg_p3+32], r12 
+  mov    [reg_p3+40], r13
+  mov    rdi, [rip+\P0+48]
+  mov    rsi, [rip+\P0+56]
+  adc    r14, rdi  
+  adc    r15, rsi  
+  mov    rdi, [rip+\P0+64]
+  mov    rsi, [rip+\P0+72]
+  adc    rax, rdi 
+  adc    rcx, rsi  
+  mov    [reg_p3+48], r14
+  mov    [reg_p3+56], r15 
+  mov    [reg_p3+64], rax 
+  mov    [reg_p3+72], rcx
+  // Restore the registers pushed at the top; the macro does not emit ret
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  .endm
+
+
+//***********************************************************************
+//  Multiprecision subtraction with correction with 2*p610
+//  Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] + 2*p610
+//*********************************************************************** 
+.global fmt(mp_sub610_p2_asm)
+fmt(mp_sub610_p2_asm):
+  // SUB610_PX saves and restores r12-r15 itself, so only the return is added here
+  SUB610_PX  fmt(p610x2)
+  ret
+
+
+#ifdef _MULX_
+#ifdef _ADX_
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x bi + z
+// Inputs: base memory pointer M1 (a),
+//         bi pre-stored in rdx,
+//         accumulator z in [M0:M2, Z3:Z10], M0:M2 being the words at M, 8M, 16M
+// Output: [M0:M2, Z3:Z10]
+// Temps:  regs T0:T2; C is zeroed by the xor that clears CF and OF
+/////////////////////////////////////////////////////////////////
+.macro MULADD64x640 M1, M, Z3, Z4, Z5, Z6, Z7, Z8, Z9, Z10, T0, T1, T2, C
+    mulx   \T0, \T1, \M1     // A0*B0
+	xor    \C, \C            // clear CF and OF before the adcx/adox chains
+    adox   \T1, \M
+    adox   \T0, 8\M  
+	mov    \M, \T1           // word 0 of the updated accumulator
+    mulx   \T1, \T2, 8\M1    // A0*B1
+    adcx   \T0, \T2
+    adox   \T1, 16\M  
+	mov    8\M, \T0          // word 1
+    mulx   \T0, \T2, 16\M1   // A0*B2
+    adcx   \T1, \T2
+    adox   \Z3, \T0 
+	mov    16\M, \T1         // word 2; words 3..10 stay in Z3:Z10
+    mulx   \T0, \T1, 24\M1   // A0*B3          
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    mulx   \T0, \T1, 32\M1   // A0*B4          
+    adcx   \Z4, \T1
+    adox   \Z5, \T0
+    mulx   \T0, \T1, 40\M1   // A0*B5          
+    adcx   \Z5, \T1
+    adox   \Z6, \T0
+    mulx   \T0, \T1, 48\M1   // A0*B6               
+    adcx   \Z6, \T1
+    adox   \Z7, \T0
+    mulx   \T0, \T1, 56\M1   // A0*B7         
+    adcx   \Z7, \T1
+    adox   \Z8, \T0
+    mulx   \T0, \T1, 64\M1   // A0*B8         
+    adcx   \Z8, \T1
+    adox   \Z9, \T0
+    mulx   \T0, \T1, 72\M1   // A0*B9         
+    adcx   \Z9, \T1
+    adox   \Z10, \T0
+    adc    \Z10, 0           // fold the final CF into the top word
+.endm
+
+
+.macro MULADD64x640b M1, M, MM, Z3, Z4, Z5, Z6, Z7, Z8, Z9, Z10, T0, T1, T2, C
+    mulx   \T0, \T1, \M1     // A0*B0; z [M, 8M, 16M, Z3:Z10] += rdx x (ten words at M1)
+	xor    \C, \C            // clear CF and OF before the adcx/adox chains (zeroes C)
+    adox   \T1, \M
+    adox   \T0, 8\M  
+	mov    24\M, \T1         // word 0 of the sum is written to 24M, not back to M
+    mulx   \T1, \T2, 8\M1    // A0*B1
+    adcx   \T0, \T2
+    adox   \T1, 16\M  
+	mov    \MM, \T0          // word 1 goes to MM
+    mulx   \T0, \T2, 16\M1   // A0*B2
+    adcx   \T1, \T2
+    adox   \Z3, \T0 
+	mov    8\MM, \T1         // word 2 goes to 8MM
+    mulx   \T0, \T1, 24\M1   // A0*B3          
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    mulx   \T0, \T1, 32\M1   // A0*B4    
+	mov    16\MM, \Z3        // word 3 goes to 16MM; Z3 is final once both chains have added into it
+    adcx   \Z4, \T1
+    adox   \Z5, \T0
+    mulx   \T0, \T1, 40\M1   // A0*B5          
+    adcx   \Z5, \T1
+    adox   \Z6, \T0
+    mulx   \T0, \T1, 48\M1   // A0*B6               
+    adcx   \Z6, \T1
+    adox   \Z7, \T0
+    mulx   \T0, \T1, 56\M1   // A0*B7         
+    adcx   \Z7, \T1
+    adox   \Z8, \T0
+    mulx   \T0, \T1, 64\M1   // A0*B8         
+    adcx   \Z8, \T1
+    adox   \Z9, \T0
+    mulx   \T0, \T1, 72\M1   // A0*B9         
+    adcx   \Z9, \T1
+    adox   \Z10, \T0
+    adc    \Z10, 0           // fold the final CF into the top word
+.endm
+
+
+.macro MULADD64x384 M1, Z0, Z1, Z2, Z3, Z4, Z5, Z6, T0, T1
+    mulx   \T0, \T1, \M1     // A0*B0; z [Z0:Z6] += rdx x (six words at M1)
+	xor    rax, rax          // clears CF and OF for the two carry chains (rax is zeroed)
+    adox   \Z0, \T1
+    adox   \Z1, \T0  
+    mulx   \T0, \T1, 8\M1    // A0*B1
+    adcx   \Z1, \T1
+    adox   \Z2, \T0    
+    mulx   \T0, \T1, 16\M1   // A0*B2
+    adcx   \Z2, \T1
+    adox   \Z3, \T0
+    mulx   \T0, \T1, 24\M1   // A0*B3          
+    adcx   \Z3, \T1
+    adox   \Z4, \T0
+    mulx   \T0, \T1, 32\M1   // A0*B4 
+    adcx   \Z4, \T1
+    adox   \Z5, \T0
+    mulx   \T0, \T1, 40\M1   // A0*B5 
+    adcx   \Z5, \T1
+    adox   \Z6, \T0
+    adc    \Z6, 0            // fold the final CF into the top word
+.endm
+
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = a x b + c x d (mod p)
+// Inputs: base memory pointers M00 (a), M10 (b), M01 (c), M11 (d);
+//         the words of b and d are loaded into rdx by the macro itself,
+//         accumulator z in [MM0:MM16, Z0:Z7], pre-stores a x b0
+// Output: low three words at [rcx], [rcx+8], [rcx+16]; upper seven words in [Z2:Z7, Z0]
+// Temps:  MM (including the word at 24MM, used for z0), regs T0:T2
+/////////////////////////////////////////////////////////////////
+.macro FPDBLMUL640x640 M00, M01, M10, M11, MM, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1, T2
+    mov    rdx, \M11    
+    MULADD64x640b \M01, \MM, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \T2        
+    // [MM0:MM16, Z1:Z7] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                  // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1
+
+    // [MM0:MM16, Z1:Z7, Z0] <- z = a0 x b01 + a1 x b11 + z 
+    mov    rdx, 8\M10
+    MULADD64x640 \M00, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \Z0
+    mov    rdx, 8\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z2:Z7, Z0] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1
+
+    // [MM0:MM16, Z2:Z7, Z0:Z1] <- z = a0 x b02 + a1 x b12 + z 
+    mov    rdx, 16\M10
+    MULADD64x640 \M00, \MM, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \T2, \Z1
+    mov    rdx, 16\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z3:Z7, Z0:Z1] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1
+
+    // [MM0:MM16, Z3:Z7, Z0:Z2] <- z = a0 x b03 + a1 x b13 + z 
+    mov    rdx, 24\M10
+    MULADD64x640 \M00, \MM, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \T2, \Z2
+    mov    rdx, 24\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z4:Z7, Z0:Z2] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1
+
+    // [MM0:MM16, Z4:Z7, Z0:Z3] <- z = a0 x b04 + a1 x b14 + z 
+    mov    rdx, 32\M10
+    MULADD64x640 \M00, \MM, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \Z3
+    mov    rdx, 32\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z5:Z7, Z0:Z3] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+
+    // [MM0:MM16, Z5:Z7, Z0:Z4] <- z = a0 x b05 + a1 x b15 + z 
+    mov    rdx, 40\M10
+    MULADD64x640 \M00, \MM, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T2, \Z4
+    mov    rdx, 40\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z6:Z7, Z0:Z4] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1
+
+    // [MM0:MM16, Z6:Z7, Z0:Z5] <- z = a0 x b06 + a1 x b16 + z 
+    mov    rdx, 48\M10
+    MULADD64x640 \M00, \MM, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T2, \Z5
+    mov    rdx, 48\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z7, Z0:Z5] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1
+
+    // [MM0:MM16, Z7, Z0:Z6] <- z = a0 x b07 + a1 x b17 + z 
+    mov    rdx, 56\M10
+    MULADD64x640 \M00, \MM, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T2, \Z6
+    mov    rdx, 56\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z0:Z6] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1
+
+    // [MM0:MM16, Z0:Z7] <- z = a0 x b08 + a1 x b18 + z 
+    mov    rdx, 64\M10
+    MULADD64x640 \M00, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \Z7
+    mov    rdx, 64\M11    
+    MULADD64x640b \M01, \MM, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \T2
+    // [MM0:MM16, Z1:Z7] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1
+
+    // [MM0:MM16, Z1:Z7, Z0] <- z = a0 x b09 + a1 x b19 + z; the second call below stores its low words to [rcx]
+    mov    rdx, 72\M10
+    MULADD64x640 \M00, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \Z0
+    mov    rdx, 72\M11    
+    MULADD64x640b \M01, \MM, [rcx], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \T2
+    // [rcx0:rcx16, Z2:Z7, Z0] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1
+.endm
+
+
+//***********************************************************************
+//  Multiplication in GF(p^2), non-complex part
+//  Operation: c [reg_p3] = a0 x b0 - a1 x b1
+//  Inputs: a = [a1, a0] stored in [reg_p1] 
+//          b = [b1, b0] stored in [reg_p2] 
+//  Output: c stored in [reg_p3]
+//***********************************************************************
+.global fmt(fp2mul610_c0_asm)
+fmt(fp2mul610_c0_asm):      
+    push   r12 
+    push   r13 
+    push   r14 
+    push   r15     
+    push   rbx   
+    push   rbp  
+	sub    rsp, 80           // ten-word stack buffer for 8*p - b1
+    mov    rcx, reg_p3       // keep the output pointer in rcx; rdx is overwritten below
+	
+	// [rsp0:rsp72] <- 8*p - b1, so that a1 x (8*p - b1) can be added instead of subtracting a1 x b1
+	mov    r8, [rip+fmt(p610x8)]  
+	mov    r9, [rip+fmt(p610x8)+8]   
+	mov    r12, [rip+fmt(p610x8)+32]
+	mov    r13, [rip+fmt(p610x8)+40] 
+	mov    rax, [reg_p2+80]
+	mov    rdx, [reg_p2+88]
+	mov    r10, r9           // the offset-8 word of p610x8 is reused for offsets 16 and 24
+	mov    r11, r9                         
+	sub    r8, rax
+	sbb    r9, rdx
+	mov    rax, [reg_p2+96]
+	mov    rdx, [reg_p2+104]
+	sbb    r10, rax
+	sbb    r11, rdx
+	mov    rax, [reg_p2+112]
+	mov    rdx, [reg_p2+120]
+	sbb    r12, rax
+	sbb    r13, rdx
+	mov    [rsp], r8
+	mov    [rsp+8], r9
+	mov    [rsp+16], r10
+	mov    [rsp+24], r11 
+	mov    [rsp+32], r12  
+	mov    [rsp+40], r13
+	mov    r8, [rip+fmt(p610x8)+48]
+	mov    r9, [rip+fmt(p610x8)+56]
+	mov    r10, [rip+fmt(p610x8)+64]
+	mov    r11, [rip+fmt(p610x8)+72]
+	mov    rax, [reg_p2+128]
+	mov    rdx, [reg_p2+136]
+	sbb    r8, rax 
+	sbb    r9, rdx 
+	mov    rax, [reg_p2+144]
+	mov    rdx, [reg_p2+152]
+	sbb    r10, rax 
+	sbb    r11, rdx 
+	mov    [rsp+48], r8
+	mov    [rsp+56], r9
+	mov    [rsp+64], r10
+	mov    [rsp+72], r11
+    
+    // [rcx0:rcx16, r11:r15, r8:r10] <- z = a0 x b00 (the a1 x b1 terms are added by the macro below)
+    mov    rdx, [reg_p2]
+    mulx   r9, r8, [reg_p1] 
+    xor    rax, rax          // clear CF for the adcx chain
+    mulx   r10, r11, [reg_p1+8] 
+	mov    [rcx], r8
+    adcx   r9, r11 
+    mulx   r11, r12, [reg_p1+16]
+	mov    [rcx+8], r9  
+    adcx   r10, r12         
+    mulx   r12, r13, [reg_p1+24] 
+	mov    [rcx+16], r10  
+    adcx   r11, r13       
+    mulx   r13, r8, [reg_p1+32] 
+    adcx   r12, r8      
+    mulx   r14, r9, [reg_p1+40]
+    adcx   r13, r9      
+    mulx   r15, rax, [reg_p1+48]
+    adcx   r14, rax     
+    mulx   r8, r10, [reg_p1+56]
+    adcx   r15, r10     
+    mulx   r9, rax, [reg_p1+64]
+    adcx   r8, rax    
+    mulx   r10, rbx, [reg_p1+72] 
+    adcx   r9, rbx    
+    adc    r10, 0 
+
+	FPDBLMUL640x640 [reg_p1], [reg_p1+80], [reg_p2], [rsp], [rcx], r11, r12, r13, r14, r15, r8, r9, r10, rbx, rbp, rax
+    // Store the upper seven result words; the macro already wrote the low three to [rcx0:rcx16]
+    mov    [rcx+24], r13 
+    mov    [rcx+32], r14   
+    mov    [rcx+40], r15 
+    mov    [rcx+48], r8   
+    mov    [rcx+56], r9      
+    mov    [rcx+64], r10                  
+    mov    [rcx+72], r11
+	add    rsp, 80
+    pop    rbp
+    pop    rbx
+    pop    r15 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Multiplication in GF(p^2), complex part
+//  Operation: c [reg_p3] = a0 x b1 + a1 x b0
+//  Inputs: a = [a1, a0] stored in [reg_p1] 
+//          b = [b1, b0] stored in [reg_p2] 
+//  Output: c stored in [reg_p3]
+//***********************************************************************
+.global fmt(fp2mul610_c1_asm)
+fmt(fp2mul610_c1_asm): 
+    push   r12    
+    push   r13     
+    push   r14   
+    push   r15        
+    push   rbx    
+    push   rbp 
+	sub    rsp, 32           // four-word scratch: low accumulator words at [rsp0:rsp16], z0 at [rsp+24]
+    mov    rcx, reg_p3       // keep the output pointer in rcx; rdx is overwritten below
+    
+    // [rsp0:rsp16, r11:r15, r8:r10] <- z = a1 x b00 (the remaining terms are added by the macro below)
+    mov    rdx, [reg_p2]
+    mulx   r9, r8, [reg_p1+80] 
+    xor    rax, rax          // clear CF for the adcx chain
+    mulx   r10, r11, [reg_p1+88] 
+	mov    [rsp], r8
+    adcx   r9, r11 
+    mulx   r11, r12, [reg_p1+96]
+	mov    [rsp+8], r9  
+    adcx   r10, r12         
+    mulx   r12, r13, [reg_p1+104] 
+	mov    [rsp+16], r10  
+    adcx   r11, r13       
+    mulx   r13, r8, [reg_p1+112] 
+    adcx   r12, r8      
+    mulx   r14, r9, [reg_p1+120]
+    adcx   r13, r9      
+    mulx   r15, rax, [reg_p1+128]
+    adcx   r14, rax     
+    mulx   r8, r10, [reg_p1+136] 
+    adcx   r15, r10     
+    mulx   r9, rax, [reg_p1+144]
+    adcx   r8, rax    
+    mulx   r10, rbx, [reg_p1+152] 
+    adcx   r9, rbx    
+    adc    r10, 0  
+
+	FPDBLMUL640x640 [reg_p1+80], [reg_p1], [reg_p2], [reg_p2+80], [rsp], r11, r12, r13, r14, r15, r8, r9, r10, rbx, rbp, rax
+    // Store the upper seven result words; the macro already wrote the low three to [rcx0:rcx16]
+    mov    [rcx+24], r13 
+    mov    [rcx+32], r14   
+    mov    [rcx+40], r15 
+    mov    [rcx+48], r8   
+    mov    [rcx+56], r9      
+    mov    [rcx+64], r10                  
+    mov    [rcx+72], r11
+	add    rsp, 32
+    pop    rbp
+    pop    rbx
+    pop    r15 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+#else
+
+# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE"
+
+#endif
+
+#else
+
+# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE USE_ADX=TRUE"
+
+#endif
+
+
+///////////////////////////////////////////////////////////////// MACRO
+// z = z + a x b, with a reduction step (z0 x p610p1 + z)/2^64 after every word of a
+// Inputs: base memory pointers M0 (a), M1 (b); words 1..9 of a are loaded into rdx by the macro,
+//         accumulator z pre-stores a0 x b: z0 at 24MM, next three words at [MM0:MM16],
+//         upper seven words in [Z4:Z7, Z0:Z2]
+// Output: low three words at [OUT0:OUT16], upper seven words in [Z5:Z7, Z0:Z3]
+// Temps:  MM, regs T0:T2
+/////////////////////////////////////////////////////////////////
+.macro FPMUL640x640 M0, M1, MM, OUT, Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7, T0, T1, T2                   
+    // [MM0:MM16, Z4:Z7, Z0:Z2] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                  // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z5:\Z7, \Z0:\Z3] <- z = a1 x b + z 
+    mov    rdx, 8\M0
+    MULADD64x640b \M1, \MM, \MM, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \Z3
+    // [MM0:MM16, \Z5:\Z7, \Z0:\Z3] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z6:\Z7, \Z0:\Z4] <- z = a2 x b + z 
+    mov    rdx, 16\M0
+    MULADD64x640b \M1, \MM, \MM, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1, \T2, \Z4
+    // [MM0:MM16, \Z6:\Z7, \Z0:\Z4] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z7, \Z0:\Z5] <- z = a3 x b + z 
+    mov    rdx, 24\M0
+    MULADD64x640b \M1, \MM, \MM, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1, \T2, \Z5
+    // [MM0:MM16, \Z7, \Z0:\Z5] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z0:\Z6] <- z = a4 x b + z 
+    mov    rdx, 32\M0
+    MULADD64x640b \M1, \MM, \MM, \Z7, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1, \T2, \Z6
+    // [MM0:MM16, \Z0:\Z6] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z1:\Z7] <- z = a5 x b + z 
+    mov    rdx, 40\M0
+    MULADD64x640b \M1, \MM, \MM, \Z0, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1, \T2, \Z7
+    // [MM0:MM16, \Z1:\Z7] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z2:\Z7, \Z0] <- z = a6 x b + z 
+    mov    rdx, 48\M0
+    MULADD64x640b \M1, \MM, \MM, \Z1, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1, \T2, \Z0
+    // [MM0:MM16, \Z2:\Z7, \Z0] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z3:\Z7, \Z0:\Z1] <- z = a7 x b + z 
+    mov    rdx, 56\M0
+    MULADD64x640b \M1, \MM, \MM, \Z2, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1, \T2, \Z1
+    // [MM0:MM16, \Z3:\Z7, \Z0:\Z1] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \T0, \T1
+
+    // [24MM, MM0:MM16, \Z4:\Z7, \Z0:\Z2] <- z = a8 x b + z 
+    mov    rdx, 64\M0
+    MULADD64x640b \M1, \MM, \MM, \Z3, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1, \T2, \Z2
+    // [MM0:MM16, \Z4:\Z7, \Z0:\Z2] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \T0, \T1
+
+    // [24MM, OUT0:OUT16, \Z5:\Z7, \Z0:\Z3] <- z = a9 x b + z 
+    mov    rdx, 72\M0
+    MULADD64x640b \M1, \MM, \OUT, \Z4, \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1, \T2, \Z3
+    // [OUT0:OUT16, \Z5:\Z7, \Z0:\Z3] <- z = (z0 x p610p1 + z)/2^64
+    mov    rdx, 24\MM                 // rdx <- z0
+    MULADD64x384 [rip+fmt(p610p1)+32], \Z5, \Z6, \Z7, \Z0, \Z1, \Z2, \Z3, \T0, \T1
+.endm
 
-#else
 
 //***********************************************************************
-//  Integer multiplication
-//  Based on Karatsuba method
-//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2]
-//  NOTE: a=c or b=c are not allowed
-//***********************************************************************
-.global fmt(mul610_asm)
-fmt(mul610_asm):
-
-  ret
-
-# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"
-
-#endif
-
-
-#ifdef _MULX_
-
-///////////////////////////////////////////////////////////////// MACRO
-// Schoolbook integer multiplication
-// Inputs:  memory pointers M0 and M1
-// Outputs: regs T0:T7
-// Temps:   regs T8
-/////////////////////////////////////////////////////////////////
-
-#ifdef _ADX_
-.macro MUL128x384_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8
-    mov    rdx, \M0
-    mulx   \T1, \T0, \M1       // T0 <- C0_final    
-    mulx   \T2, \T4, 8\M1
-    xor    rax, rax
-    mulx   \T3, \T5, 16\M1 
-    adox   \T1, \T4               
-    adox   \T2, \T5     
-    mulx   \T4, \T7, 24\M1
-    adox   \T3, \T7         
-    mulx   \T5, \T6, 32\M1 
-    adox   \T4, \T6         
-    mulx   \T7, \T8, 40\M1           
-    adox   \T5, \T8         
-    adox   \T7, rax   
-    
-    mov    rdx, 8\M0 
-    mulx   \T8, \T6, \M1 
-    adcx   \T1, \T6            // T1 <- C1_final 
-    adcx   \T2, \T8    
-    mulx   \T6, \T8, 8\M1
-    adox   \T2, \T8  
-    adcx   \T3, \T6        
-    mulx   \T6, \T8, 16\M1
-    adox   \T3, \T8
-    adcx   \T4, \T6     
-    mulx   \T6, \T8, 24\M1
-    adox   \T4, \T8     
-    adcx   \T5, \T6  
-    mulx   \T6, \T8, 32\M1 
-    adox   \T5, \T8 
-    adcx   \T6, \T7 
-    mulx   \T7, \T8, 40\M1
-    adcx   \T7, rax  
-    adox   \T6, \T8          
-    adox   \T7, rax
-.endm
-
-#else
-
-.macro MUL128x384_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8
-    mov    rdx, \M0
-    mulx   \T1, \T0, \M1       // T0 <- C0_final 
-    mulx   \T2, \T3, 8\M1
-    add    \T1, \T3               
-    adc    \T2, 0  
-
-    mov    rdx, 8\M0   
-    xor    \T5, \T5
-    mulx   \T3, \T4, \M1 
-    add    \T1, \T4               
-    adc    \T2, \T3  
-    adc    \T5, 0  
-      
-    xor    \T6, \T6
-    mulx   \T3, \T4, 8\M1
-    add    \T2, \T4  
-    adc    \T3, \T5           
-    adc    \T6, 0 
-        
-    mov    rdx, \M0         
-    mulx   \T4, \T5, 16\M1 
-    add    \T2, \T5  
-    adc    \T3, \T4           
-    adc    \T6, 0  
-        
-    xor    \T7, \T7        
-    mulx   \T4, \T5, 24\M1 
-    add    \T3, \T5  
-    adc    \T4, \T6           
-    adc    \T7, 0  
-
-    mov    rdx, 8\M0 
-    mulx   \T5, \T6, 16\M1 
-    add    \T3, \T6               
-    adc    \T4, \T5  
-    adc    \T7, 0    
-        
-    xor    \T6, \T6        
-    mulx   \T5, \T8, 24\M1 
-    add    \T4, \T8  
-    adc    \T5, \T7           
-    adc    \T6, 0  
-        
-    mov    rdx, \M0        
-    mulx   \T7, \T8, 32\M1 
-    add    \T4, \T8  
-    adc    \T5, \T7           
-    adc    \T6, 0      
-        
-    xor    \T7, \T7        
-    mulx   \T8, rax, 40\M1 
-    add    \T5, rax  
-    adc    \T6, \T8          
-    adc    \T7, 0  
-        
-    mov    rdx, 8\M0        
-    mulx   \T8, rax, 32\M1 
-    add    \T5, rax  
-    adc    \T6, \T8         
-    adc    \T7, 0   
-        
-    mov    rdx, 8\M0        
-    mulx   \T8, rax, 40\M1 
-    add    \T6, rax  
-    adc    \T7, \T8  
-.endm
-#endif
-
-  
-//**************************************************************************************
-//  Montgomery reduction
-//  Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015  
-//  Operation: c [reg_p2] = a [reg_p1]
-//  NOTE: a=c is not allowed
-//************************************************************************************** 
-.global fmt(rdc610_asm)
-fmt(rdc610_asm):
-    push   r12
-    push   r13 
-    push   r14 
-    push   r15  
-
-    // a[0-1] x p610p1_nz --> result: r8:r15 
-    MUL128x384_SCHOOL [reg_p1], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx     
-
-    xor    rcx, rcx
-    add    r8, [reg_p1+32]  
-    adc    r9, [reg_p1+40]  
-    adc    r10, [reg_p1+48]   
-    adc    r11, [reg_p1+56]   
-    adc    r12, [reg_p1+64]   
-    adc    r13, [reg_p1+72]   
-    adc    r14, [reg_p1+80]  
-    adc    r15, [reg_p1+88]   
-    adc    rcx, [reg_p1+96] 
-    mov    [reg_p1+32], r8  
-    mov    [reg_p1+40], r9  
-    mov    [reg_p1+48], r10  
-    mov    [reg_p1+56], r11  
-    mov    [reg_p1+64], r12  
-    mov    [reg_p1+72], r13  
-    mov    [reg_p1+80], r14
-    mov    [reg_p1+88], r15  
-    mov    [reg_p1+96], rcx  
-    mov    r8, [reg_p1+104]  
-    mov    r9, [reg_p1+112]  
-    mov    r10, [reg_p1+120]
-    mov    r11, [reg_p1+128]
-    mov    r12, [reg_p1+136]
-    mov    r13, [reg_p1+144]
-    mov    r14, [reg_p1+152]
-    adc    r8, 0
-    adc    r9, 0
-    adc    r10, 0
-    adc    r11, 0
-    adc    r12, 0
-    adc    r13, 0
-    adc    r14, 0
-    mov    [reg_p1+104], r8  
-    mov    [reg_p1+112], r9  
-    mov    [reg_p1+120], r10  
-    mov    [reg_p1+128], r11  
-    mov    [reg_p1+136], r12 
-    mov    [reg_p1+144], r13 
-    mov    [reg_p1+152], r14
-
-    // a[2-3] x p610p1_nz --> result: r8:r15
-    MUL128x384_SCHOOL [reg_p1+16], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx 
-
-    xor    rcx, rcx
-    add    r8, [reg_p1+48]  
-    adc    r9, [reg_p1+56]  
-    adc    r10, [reg_p1+64]   
-    adc    r11, [reg_p1+72]  
-    adc    r12, [reg_p1+80]   
-    adc    r13, [reg_p1+88]   
-    adc    r14, [reg_p1+96]  
-    adc    r15, [reg_p1+104]
-    adc    rcx, [reg_p1+112]
-    mov    [reg_p1+48], r8  
-    mov    [reg_p1+56], r9  
-    mov    [reg_p1+64], r10  
-    mov    [reg_p1+72], r11   
-    mov    [reg_p1+80], r12  
-    mov    [reg_p1+88], r13  
-    mov    [reg_p1+96], r14
-    mov    [reg_p1+104], r15
-    mov    [reg_p1+112], rcx
-    mov    r8, [reg_p1+120]
-    mov    r9, [reg_p1+128]
-    mov    r10, [reg_p1+136] 
-    mov    r11, [reg_p1+144] 
-    mov    r12, [reg_p1+152] 
-    adc    r8, 0
-    adc    r9, 0
-    adc    r10, 0  
-    adc    r11, 0 
-    adc    r12, 0  
-    mov    [reg_p1+120], r8  
-    mov    [reg_p1+128], r9  
-    mov    [reg_p1+136], r10 
-    mov    [reg_p1+144], r11 
-    mov    [reg_p1+152], r12 
-
-    // a[4-5] x p610p1_nz --> result: r8:r15
-    MUL128x384_SCHOOL [reg_p1+32], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx 
-
-    xor    rcx, rcx
-    add    r8, [reg_p1+64]  
-    adc    r9, [reg_p1+72]  
-    adc    r10, [reg_p1+80]   
-    adc    r11, [reg_p1+88]  
-    adc    r12, [reg_p1+96]   
-    adc    r13, [reg_p1+104]   
-    adc    r14, [reg_p1+112]  
-    adc    r15, [reg_p1+120]
-    adc    rcx, [reg_p1+128]
-    mov    [reg_p1+64], r8  
-    mov    [reg_p1+72], r9  
-    mov    [reg_p1+80], r10  
-    mov    [reg_p1+88], r11   
-    mov    [reg_p1+96], r12  
-    mov    [reg_p1+104], r13  
-    mov    [reg_p1+112], r14
-    mov    [reg_p1+120], r15
-    mov    [reg_p1+128], rcx
-    mov    r8, [reg_p1+136]
-    mov    r9, [reg_p1+144]
-    mov    r10, [reg_p1+152] 
-    adc    r8, 0
-    adc    r9, 0
-    adc    r10, 0 
-    mov    [reg_p1+136], r8  
-    mov    [reg_p1+144], r9  
-    mov    [reg_p1+152], r10 
-
-    // a[6-7] x p610p1_nz --> result: r8:r15
-    MUL128x384_SCHOOL [reg_p1+48], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx 
-
-    xor    rcx, rcx
-    add    r8, [reg_p1+80]  
-    adc    r9, [reg_p1+88]  
-    adc    r10, [reg_p1+96]   
-    adc    r11, [reg_p1+104]  
-    adc    r12, [reg_p1+112]   
-    adc    r13, [reg_p1+120]   
-    adc    r14, [reg_p1+128]  
-    adc    r15, [reg_p1+136]
-    adc    rcx, [reg_p1+144]
-    mov    [reg_p2], r8         // C0_final
-    mov    [reg_p2+8], r9       // C1_final
-    mov    [reg_p1+96], r10  
-    mov    [reg_p1+104], r11   
-    mov    [reg_p1+112], r12  
-    mov    [reg_p1+120], r13  
-    mov    [reg_p1+128], r14
-    mov    [reg_p1+136], r15
-    mov    [reg_p1+144], rcx
-    mov    r8, [reg_p1+152] 
-    adc    r8, 0
-    mov    [reg_p1+152], r8
-
-    // a[8-9] x p610p1_nz --> result: r8:r15
-    MUL128x384_SCHOOL [reg_p1+64], [rip+fmt(p610p1)+32], r8, r9, r10, r11, r12, r13, r14, r15, rcx
-    
-    // Final result C2:C9
-    add    r8, [reg_p1+96]  
-    adc    r9, [reg_p1+104]  
-    adc    r10, [reg_p1+112]   
-    adc    r11, [reg_p1+120]  
-    adc    r12, [reg_p1+128]   
-    adc    r13, [reg_p1+136]   
-    adc    r14, [reg_p1+144]   
-    adc    r15, [reg_p1+152] 
-    mov    [reg_p2+16], r8
-    mov    [reg_p2+24], r9  
-    mov    [reg_p2+32], r10   
-    mov    [reg_p2+40], r11  
-    mov    [reg_p2+48], r12  
-    mov    [reg_p2+56], r13 
-    mov    [reg_p2+64], r14 
-    mov    [reg_p2+72], r15
-
-    pop    r15
-    pop    r14
-    pop    r13
-    pop    r12
+//  Squaring in GF(p^2), non-complex part
+//  Operation: c [reg_p2] = (a0+a1) x (a0-a1)
+//  Inputs: a = [a1, a0] stored in [reg_p1]; a0 is read at [reg_p1], a1 at [reg_p1+80]
+//  Output: c stored in [reg_p2]; [reg_p2]..[reg_p2+152] is first overwritten with a0+a1 and a0-a1+4xp610
+//***********************************************************************
+.global fmt(fp2sqr610_c0_asm)
+fmt(fp2sqr610_c0_asm):  
+    push   r12                          // registers pushed here are restored, in reverse order, before ret
+    push   r13
+    push   r14  
+    push   r15  
+    push   rbx  
+    push   rbp
+	sub    rsp, 32                      // four 64-bit stack slots: [rsp]..[rsp+24]
+
+	// a0 + a1 -> [reg_p2]..[reg_p2+72]
+	mov    r8, [reg_p1]
+	mov    r9, [reg_p1+8]
+	mov    r10, [reg_p1+16]
+	mov    r11, [reg_p1+24]
+	mov    r12, [reg_p1+32]
+	mov    r13, [reg_p1+40]
+	add    r8, [reg_p1+80]
+	adc    r9, [reg_p1+88] 
+	mov    [reg_p2], r8                 // stores are interleaved with the add/adc chain; mov does not change the carry flag
+	adc    r10, [reg_p1+96]
+	adc    r11, [reg_p1+104]
+	mov    [reg_p2+8], r9
+	mov    [reg_p2+16], r10
+	adc    r12, [reg_p1+112]
+	adc    r13, [reg_p1+120]
+	mov    r14, [reg_p1+48]
+	mov    r15, [reg_p1+56]
+	adc    r14, [reg_p1+128]
+	adc    r15, [reg_p1+136]
+	mov    r9, [reg_p1+64]              // r9/r10 are reused: words 1-2 of the sum were already stored above
+	mov    r10, [reg_p1+72]
+	adc    r9, [reg_p1+144]
+	adc    r10, [reg_p1+152]
+	mov    [reg_p2+24], r11
+	mov    [reg_p2+32], r12
+	mov    [reg_p2+40], r13
+	mov    [reg_p2+48], r14
+	mov    [reg_p2+56], r15
+	mov    [reg_p2+64], r9
+	mov    [reg_p2+72], r10
+	
+	// a0 - a1 + 4xp610 -> [reg_p2+80]..[reg_p2+152]
+	mov    rcx, [reg_p1]
+	mov    r10, [reg_p1+8]
+	mov    r12, [reg_p1+16]
+	mov    r13, [reg_p1+24]
+	mov    r14, [reg_p1+32]
+	mov    r15, [reg_p1+40]
+	sub    rcx, [reg_p1+80]
+	sbb    r10, [reg_p1+88]
+	sbb    r12, [reg_p1+96]
+	sbb    r13, [reg_p1+104] 
+	sbb    r14, [reg_p1+112]
+	sbb    r15, [reg_p1+120]
+	mov    rbx, [reg_p1+48]
+	mov    rbp, [reg_p1+56]
+	mov    r8, [reg_p1+64]
+	mov    rax, [reg_p1+72]
+	sbb    rbx, [reg_p1+128]
+	sbb    rbp, [reg_p1+136]
+	sbb    r8, [reg_p1+144]
+	sbb    rax, [reg_p1+152]
+	add    rcx, [rip+fmt(p610x4)]       // the borrow out of the subtraction is not used; 4xp610 is added unconditionally
+	mov    rdx, [rip+fmt(p610x4)+8]     // one load reused for words 1-3 of p610x4 (all 0xFFFFFFFFFFFFFFFF in P610.c)
+	adc    r10, rdx
+	adc    r12, rdx
+	adc    r13, rdx
+	adc    r14, [rip+fmt(p610x4)+32]
+	adc    r15, [rip+fmt(p610x4)+40]
+	adc    rbx, [rip+fmt(p610x4)+48]
+	adc    rbp, [rip+fmt(p610x4)+56]
+	adc    r8, [rip+fmt(p610x4)+64]
+	adc    rax, [rip+fmt(p610x4)+72]
+	mov    [reg_p2+80], rcx                 
+	mov    [reg_p2+88], r10
+	mov    [reg_p2+96], r12 
+	mov    [reg_p2+104], r13 
+	mov    [reg_p2+112], r14 
+	mov    [reg_p2+144], r8             // words 5-7 (r15, rbx, rbp) are stored further down, between the mulx instructions
+	mov    [reg_p2+152], rax
+    
+    // z = (a0+a1)[0] x (a0-a1+4xp610): word 0 -> [rsp+24], words 1-3 -> [rsp]..[rsp+16] (word 3 also in r11), words 4-10 -> r12:r15, r8:r10
+    mov    rdx, [reg_p2]                // word 0 of a0+a1 (stored above) is the implicit mulx multiplicand
+    mulx   r9, r8, rcx
+	mov    [reg_p2+120], r15            // word 5 of a0-a1+4xp610
+    xor    rax, rax                     // clears the carry flag before the adcx chain
+    mulx   r10, r11, r10 
+	mov    [reg_p2+128], rbx            // word 6 of a0-a1+4xp610
+    adcx   r9, r11     
+    mulx   r11, r12, r12 
+	mov    [reg_p2+136], rbp            // word 7 of a0-a1+4xp610
+    adcx   r10, r12       
+    mulx   r12, r13, r13 
+	mov    [rsp+24], r8                 // word 0 of z
+    adcx   r11, r13      
+    mulx   r13, r8, r14  
+	mov    [rsp], r9                    // word 1 of z
+    adcx   r12, r8      
+    mulx   r14, r9, r15 
+	mov    [rsp+8], r10                 // word 2 of z
+    adcx   r13, r9      
+    mulx   r15, rax, rbx 
+	mov    [rsp+16], r11                // word 3 of z
+    adcx   r14, rax     
+    mulx   r8, r10, rbp  
+    adcx   r15, r10     
+    mulx   r9, rax, [reg_p2+144] 
+    adcx   r8, rax    
+    mulx   r10, rbx, [reg_p2+152] 
+    adcx   r9, rbx    
+    adc    r10, 0                       // fold the final carry into the top word (word 10)
+    // NOTE(review): FPMUL640x640 is defined outside this hunk; presumably it finishes the multiplication and reduction starting from z - confirm
+	FPMUL640x640 [reg_p2], [reg_p2+80], [rsp], [reg_p2], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp, rax
+    // NOTE(review): words 0-2 of c are presumably written by the macro through its [reg_p2] argument; words 3-9 are stored here - confirm
+    mov    [reg_p2+24], r13   
+    mov    [reg_p2+32], r14 
+    mov    [reg_p2+40], r15  
+    mov    [reg_p2+48], r8      
+    mov    [reg_p2+56], r9                  
+    mov    [reg_p2+64], r10                 
+    mov    [reg_p2+72], r11
+	add    rsp, 32                      // release the stack scratch
+    pop    rbp
+    pop    rbx
+    pop    r15 
+    pop    r14
+    pop    r13
+    pop    r12
     ret
 
-  #else
-  
-//***********************************************************************
-//  Montgomery reduction
-//  Based on comba method
-//  Operation: c [reg_p2] = a [reg_p1]
-//  NOTE: a=c is not allowed
-//*********************************************************************** 
-.global fmt(rdc610_asm)
-fmt(rdc610_asm):
-
-  ret
-
-# error "CONFIGURATION NOT SUPPORTED. TRY USE_MULX=TRUE"
-
-  #endif
-
-
-//***********************************************************************
-//  610-bit multiprecision addition
-//  Operation: c [reg_p3] = a [reg_p1] + b [reg_p2]
-//*********************************************************************** 
-.global fmt(mp_add610_asm)
-fmt(mp_add610_asm):  
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    rax, [reg_p1+32]
-  add    r8, [reg_p2] 
-  adc    r9, [reg_p2+8] 
-  adc    r10, [reg_p2+16] 
-  adc    r11, [reg_p2+24] 
-  adc    rax, [reg_p2+32] 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], rax
-
-  mov    r8, [reg_p1+40]
-  mov    r9, [reg_p1+48] 
-  mov    r10, [reg_p1+56]
-  mov    r11, [reg_p1+64] 
-  mov    rax, [reg_p1+72] 
-  adc    r8, [reg_p2+40] 
-  adc    r9, [reg_p2+48]
-  adc    r10, [reg_p2+56] 
-  adc    r11, [reg_p2+64]
-  adc    rax, [reg_p2+72]
-  mov    [reg_p3+40], r8
-  mov    [reg_p3+48], r9
-  mov    [reg_p3+56], r10
-  mov    [reg_p3+64], r11
-  mov    [reg_p3+72], rax
-  ret
-
-
-//***********************************************************************
-//  2x610-bit multiprecision subtraction/addition
-//  Operation: c [x2] = a [x0] - b [x1]. If c < 0, add p610*2^640
-//*********************************************************************** 
-.global fmt(mp_subadd610x2_asm)
-fmt(mp_subadd610x2_asm):
-  push   r12
-  push   r13 
-  push   r14 
-  push   r15
-  push   rbx
-  xor    rax, rax
-  mov    r8, [reg_p1]
-  mov    r9, [reg_p1+8]
-  mov    r10, [reg_p1+16]
-  mov    r11, [reg_p1+24]
-  mov    rcx, [reg_p1+32]
-  sub    r8, [reg_p2] 
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    rcx, [reg_p2+32] 
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], rcx
-
-  mov    r8, [reg_p1+40]
-  mov    r9, [reg_p1+48]
-  mov    r10, [reg_p1+56] 
-  mov    r11, [reg_p1+64]
-  mov    rcx, [reg_p1+72] 
-  sbb    r8, [reg_p2+40] 
-  sbb    r9, [reg_p2+48] 
-  sbb    r10, [reg_p2+56]
-  sbb    r11, [reg_p2+64] 
-  sbb    rcx, [reg_p2+72]
-  mov    [reg_p3+40], r8
-  mov    [reg_p3+48], r9
-  mov    [reg_p3+56], r10
-  mov    [reg_p3+64], r11
-  mov    [reg_p3+72], rcx
-  
-  mov    r8, [reg_p1+80]
-  mov    r9, [reg_p1+88] 
-  mov    r10, [reg_p1+96]
-  mov    r11, [reg_p1+104]
-  mov    rcx, [reg_p1+112]
-  sbb    r8, [reg_p2+80]
-  sbb    r9, [reg_p2+88]
-  sbb    r10, [reg_p2+96] 
-  sbb    r11, [reg_p2+104] 
-  sbb    rcx, [reg_p2+112]
-  mov    [reg_p3+80], r8 
-  mov    [reg_p3+88], r9
-  mov    [reg_p3+96], r10
-  mov    [reg_p3+104], r11
-  mov    [reg_p3+112], rcx
-  
-  mov    r8, [reg_p1+120]
-  mov    r9, [reg_p1+128]
-  mov    r10, [reg_p1+136]
-  mov    r11, [reg_p1+144]
-  mov    rcx, [reg_p1+152]
-  sbb    r8, [reg_p2+120] 
-  sbb    r9, [reg_p2+128] 
-  sbb    r10, [reg_p2+136] 
-  sbb    r11, [reg_p2+144] 
-  sbb    rcx, [reg_p2+152]
-  sbb    rax, 0
-  
-  // Add p610 anded with the mask in rax 
-  mov    r12, [rip+fmt(p610)]
-  mov    r13, [rip+fmt(p610)+32]
-  mov    r14, [rip+fmt(p610)+40]
-  mov    r15, [rip+fmt(p610)+48]
-  mov    rdi, [rip+fmt(p610)+56]
-  mov    rsi, [rip+fmt(p610)+64]
-  mov    rbx, [rip+fmt(p610)+72]
-  and    r12, rax
-  and    r13, rax
-  and    r14, rax
-  and    r15, rax
-  and    rdi, rax
-  and    rsi, rax
-  and    rbx, rax
-  mov    rax, [reg_p3+80]
-  add    rax, r12
-  mov    [reg_p3+80], rax
-  mov    rax, [reg_p3+88]
-  adc    rax, r12
-  mov    [reg_p3+88], rax
-  mov    rax, [reg_p3+96]
-  adc    rax, r12
-  mov    [reg_p3+96], rax
-  adc    r12, [reg_p3+104]
-  adc    r13, [reg_p3+112]
-  mov    [reg_p3+104], r12
-  mov    [reg_p3+112], r13
-  adc    r8, r14
-  adc    r9, r15
-  adc    r10, rdi
-  adc    r11, rsi
-  adc    rcx, rbx
-  
-  mov    [reg_p3+120], r8
-  mov    [reg_p3+128], r9
-  mov    [reg_p3+136], r10
-  mov    [reg_p3+144], r11
-  mov    [reg_p3+152], rcx
-  pop    rbx
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
-
-
 //***********************************************************************
-//  Double 2x610-bit multiprecision subtraction
-//  Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2]
-//*********************************************************************** 
-.global fmt(mp_dblsub610x2_asm)
-fmt(mp_dblsub610x2_asm):
-  push   r12
-  push   r13
-  push   r14
-  push   r15
-  
-  mov    r8, [reg_p3]
-  mov    r9, [reg_p3+8]
-  mov    r10, [reg_p3+16]
-  mov    r11, [reg_p3+24]
-  mov    r12, [reg_p3+32]
-  mov    r13, [reg_p3+40]
-  mov    r14, [reg_p3+48]
-  mov    r15, [reg_p3+56]
-  sub    r8, [reg_p1]
-  sbb    r9, [reg_p1+8] 
-  sbb    r10, [reg_p1+16] 
-  sbb    r11, [reg_p1+24] 
-  sbb    r12, [reg_p1+32] 
-  sbb    r13, [reg_p1+40] 
-  sbb    r14, [reg_p1+48] 
-  sbb    r15, [reg_p1+56]
-  setc   al
-  sub    r8, [reg_p2]
-  sbb    r9, [reg_p2+8] 
-  sbb    r10, [reg_p2+16] 
-  sbb    r11, [reg_p2+24] 
-  sbb    r12, [reg_p2+32] 
-  sbb    r13, [reg_p2+40] 
-  sbb    r14, [reg_p2+48] 
-  sbb    r15, [reg_p2+56]
-  setc   cl
-  mov    [reg_p3], r8
-  mov    [reg_p3+8], r9
-  mov    [reg_p3+16], r10
-  mov    [reg_p3+24], r11
-  mov    [reg_p3+32], r12
-  mov    [reg_p3+40], r13
-  mov    [reg_p3+48], r14
-  mov    [reg_p3+56], r15
-    
-  mov    r8, [reg_p3+64]
-  mov    r9, [reg_p3+72]
-  mov    r10, [reg_p3+80]
-  mov    r11, [reg_p3+88]
-  mov    r12, [reg_p3+96]
-  mov    r13, [reg_p3+104]
-  mov    r14, [reg_p3+112]
-  mov    r15, [reg_p3+120]
-  bt     rax, 0 
-  sbb    r8, [reg_p1+64] 
-  sbb    r9, [reg_p1+72] 
-  sbb    r10, [reg_p1+80] 
-  sbb    r11, [reg_p1+88] 
-  sbb    r12, [reg_p1+96] 
-  sbb    r13, [reg_p1+104] 
-  sbb    r14, [reg_p1+112] 
-  sbb    r15, [reg_p1+120]
-  setc   al 
-  bt     rcx, 0  
-  sbb    r8, [reg_p2+64] 
-  sbb    r9, [reg_p2+72] 
-  sbb    r10, [reg_p2+80] 
-  sbb    r11, [reg_p2+88] 
-  sbb    r12, [reg_p2+96] 
-  sbb    r13, [reg_p2+104] 
-  sbb    r14, [reg_p2+112] 
-  sbb    r15, [reg_p2+120]
-  setc   cl 
-  mov    [reg_p3+64], r8
-  mov    [reg_p3+72], r9
-  mov    [reg_p3+80], r10
-  mov    [reg_p3+88], r11
-  mov    [reg_p3+96], r12
-  mov    [reg_p3+104], r13
-  mov    [reg_p3+112], r14
-  mov    [reg_p3+120], r15
-  
-  mov    r8, [reg_p3+128]
-  mov    r9, [reg_p3+136]
-  mov    r10, [reg_p3+144]
-  mov    r11, [reg_p3+152]
-  bt     rax, 0 
-  sbb    r8, [reg_p1+128] 
-  sbb    r9, [reg_p1+136] 
-  sbb    r10, [reg_p1+144] 
-  sbb    r11, [reg_p1+152]
-  bt     rcx, 0 
-  sbb    r8, [reg_p2+128] 
-  sbb    r9, [reg_p2+136] 
-  sbb    r10, [reg_p2+144] 
-  sbb    r11, [reg_p2+152]
-  mov    [reg_p3+128], r8
-  mov    [reg_p3+136], r9
-  mov    [reg_p3+144], r10
-  mov    [reg_p3+152], r11
-  
-  pop    r15
-  pop    r14
-  pop    r13
-  pop    r12
-  ret
\ No newline at end of file
+//  Squaring in GF(p^2), complex part
+//  Operation: c [reg_p2] = 2a0 x a1
+//  Inputs: a = [a1, a0] stored in [reg_p1]; a0 is read at [reg_p1], a1 at [reg_p1+80]
+//  Output: c stored in [reg_p2]
+//***********************************************************************
+.global fmt(fp2sqr610_c1_asm)
+fmt(fp2sqr610_c1_asm):  
+    push   r12                          // registers pushed in this routine are restored, in reverse order, before ret
+    push   r13     
+    push   r14 
+    push   r15 
+	
+	mov    r8, [reg_p1]
+	mov    r9, [reg_p1+8]
+	mov    r10, [reg_p1+16]
+	mov    r11, [reg_p1+24]
+	mov    r12, [reg_p1+32]
+	mov    r13, [reg_p1+40] 
+	add    r8, r8                       // 2 x a0: each word is added to itself, with the carry propagated by adc
+	adc    r9, r9
+    push   rbx                          // push and mov do not change the flags, so the adc chain continues across them
+	adc    r10, r10
+	adc    r11, r11  
+    push   rbp
+	adc    r12, r12
+	adc    r13, r13
+	mov    r14, [reg_p1+48] 
+	mov    r15, [reg_p1+56] 
+	adc    r14, r14
+	adc    r15, r15
+	mov    rbx, [reg_p1+64] 
+	mov    rbp, [reg_p1+72] 
+	adc    rbx, rbx
+	adc    rbp, rbp
+	sub    rsp, 112                     // 14 stack slots: [rsp+8]..[rsp+72] = words 1-9 of 2a0, [rsp+80]..[rsp+104] = low words of z
+	mov    [rsp+8], r9
+	mov    [rsp+16], r10
+	mov    [rsp+24], r11
+    
+    // z = (2a0)[0] x a1: word 0 -> [rsp+104], words 1-3 -> [rsp+80]..[rsp+96] (word 3 also in r11), words 4-10 -> r12:r15, r8:r10
+    mov    rdx, r8                      // word 0 of 2a0 becomes the implicit mulx multiplicand; [rsp] itself is never written in this routine
+    mulx   r9, r8, [reg_p1+80]
+	mov    [rsp+32], r12                // words 4-9 of 2a0 are spilled between the mulx instructions, before their registers are overwritten
+    xor    rax, rax                     // clears the carry flag before the adcx chain
+    mulx   r10, r11, [reg_p1+88] 
+	mov    [rsp+40], r13
+    adcx   r9, r11     
+    mulx   r11, r12, [reg_p1+96]
+	mov    [rsp+48], r14
+    adcx   r10, r12         
+    mulx   r12, r13, [reg_p1+104] 
+	mov    [rsp+104], r8                // word 0 of z
+    adcx   r11, r13         
+    mulx   r13, r8, [reg_p1+112]
+	mov    [rsp+80], r9                 // word 1 of z
+    adcx   r12, r8      
+    mulx   r14, r9, [reg_p1+120]
+	mov    [rsp+56], r15
+    adcx   r13, r9      
+    mulx   r15, rax, [reg_p1+128]
+	mov    [rsp+88], r10                // word 2 of z
+    adcx   r14, rax     
+    mulx   r8, r10, [reg_p1+136] 
+	mov    [rsp+96], r11                // word 3 of z
+    adcx   r15, r10     
+    mulx   r9, rax, [reg_p1+144] 
+	mov    [rsp+64], rbx
+    adcx   r8, rax    
+    mulx   r10, rbx, [reg_p1+152] 
+	mov    [rsp+72], rbp
+    adcx   r9, rbx    
+    adc    r10, 0                       // fold the final carry into the top word (word 10)
+    // NOTE(review): FPMUL640x640 is defined outside this hunk; presumably it reads only words 1-9 of its first operand ([rsp+8] onward) - confirm
+	FPMUL640x640 [rsp], [reg_p1+80], [rsp+80], [reg_p2], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp, rax
+    // NOTE(review): words 0-2 of c are presumably written by the macro through its [reg_p2] argument; words 3-9 are stored here - confirm
+    mov    [reg_p2+24], r13   
+    mov    [reg_p2+32], r14 
+    mov    [reg_p2+40], r15  
+    mov    [reg_p2+48], r8      
+    mov    [reg_p2+56], r9                  
+    mov    [reg_p2+64], r10                 
+    mov    [reg_p2+72], r11
+	add    rsp, 112                     // release the stack scratch
+    pop    rbp
+    pop    rbx
+    pop    r15 
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
+
+
+//***********************************************************************
+//  Field multiplication in GF(p)
+//  Operation: c = a x b mod p
+//  Inputs: a stored in [reg_p1], b stored in [reg_p2] 
+//  Output: c stored in [reg_p3]
+//***********************************************************************
+.global fmt(fpmul610_asm)
+fmt(fpmul610_asm): 
+    mov    rcx, reg_p3                  // keep the output pointer in rcx; rdx is overwritten below (NOTE(review): reg_p3 presumably aliases rdx - confirm)
+    push   r12                          // registers pushed here are restored, in reverse order, before ret
+    push   r13 
+    push   r14 
+    push   r15 
+    push   rbx    
+    push   rbp
+	sub    rsp, 32                      // four 64-bit stack slots: [rsp]..[rsp+24]
+     
+    // z = a x b0: word 0 -> [rsp+24], words 1-3 -> [rsp]..[rsp+16] (word 3 also in r11), words 4-10 -> r12:r15, r8:r10
+    mov    rdx, [reg_p2]                // b0 is the implicit mulx multiplicand
+    mulx   r9, r8, [reg_p1]
+    xor    rax, rax                     // clears the carry flag before the adcx chain
+	mov    [rsp+24], r8                 // word 0 of z
+    mulx   r10, r11, [reg_p1+8]
+    adcx   r9, r11        
+    mulx   r11, r12, [reg_p1+16]
+    adcx   r10, r12        
+    mulx   r12, r13, [reg_p1+24]
+    adcx   r11, r13       
+    mulx   r13, r8, [reg_p1+32]
+    adcx   r12, r8      
+    mulx   r14, rax, [reg_p1+40]
+    adcx   r13, rax     
+    mulx   r15, rax, [reg_p1+48]   
+	mov    [rsp], r9                    // word 1 of z
+    adcx   r14, rax      
+    mulx   r8, rbx, [reg_p1+56]  
+	mov    [rsp+8], r10                 // word 2 of z
+    adcx   r15, rbx     
+    mulx   r9, rax, [reg_p1+64] 
+	mov    [rsp+16], r11                // word 3 of z
+    adcx   r8, rax     
+    mulx   r10, rbx, [reg_p1+72]  
+    adcx   r9, rbx 
+    adc    r10, 0                       // fold the final carry into the top word (word 10)
+    // NOTE(review): FPMUL640x640 is defined outside this hunk; presumably it handles b1..b9 and the reduction, starting from z - confirm
+	FPMUL640x640 [reg_p2], [reg_p1], [rsp], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp, rax
+	// NOTE(review): words 0-2 of c are presumably written by the macro through its [rcx] argument; words 3-9 are stored here - confirm
+    mov    [rcx+24], r13   
+    mov    [rcx+32], r14 
+    mov    [rcx+40], r15  
+    mov    [rcx+48], r8      
+    mov    [rcx+56], r9                  
+    mov    [rcx+64], r10                 
+    mov    [rcx+72], r11
+	add    rsp, 32                      // release the stack scratch
+    pop    rbp
+    pop    rbx
+    pop    r15
+    pop    r14
+    pop    r13
+    pop    r12
+    ret
\ No newline at end of file
diff --git a/src/P610/ARM64/fp_arm64.c b/src/P610/ARM64/fp_arm64.c
index bd72d88..ebf051b 100644
--- a/src/P610/ARM64/fp_arm64.c
+++ b/src/P610/ARM64/fp_arm64.c
@@ -1,10 +1,15 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P610
 *********************************************************************************************/
 
 #include "../P610_internal.h"
+#include "../../internal.h"
 
 // Global constants
 extern const uint64_t p610[NWORDS_FIELD];
@@ -13,21 +18,21 @@ extern const uint64_t p610x2[NWORDS_FIELD];
 extern const uint64_t p610x4[NWORDS_FIELD];
 
 
-__inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub610_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. 
     
     mp_sub610_p2_asm(a, b, c); 
 } 
 
 
-__inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub610_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
     
     mp_sub610_p4_asm(a, b, c);
 }
 
 
-__inline void fpadd610(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpadd610(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular addition, c = a+b mod p610.
   // Inputs: a, b in [0, 2*p610-1] 
   // Output: c in [0, 2*p610-1]
@@ -36,7 +41,7 @@ __inline void fpadd610(const digit_t* a, const digit_t* b, digit_t* c)
 } 
 
 
-__inline void fpsub610(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpsub610(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular subtraction, c = a-b mod p610.
   // Inputs: a, b in [0, 2*p610-1] 
   // Output: c in [0, 2*p610-1] 
@@ -45,7 +50,7 @@ __inline void fpsub610(const digit_t* a, const digit_t* b, digit_t* c)
 }
 
 
-__inline void fpneg610(digit_t* a)
+inline void fpneg610(digit_t* a)
 { // Modular negation, a = -a mod p610.
   // Input/output: a in [0, 2*p610-1] 
     unsigned int i, borrow = 0;
diff --git a/src/P610/ARM64/fp_arm64_asm.S b/src/P610/ARM64/fp_arm64_asm.S
index b1ecf43..06a3190 100644
--- a/src/P610/ARM64/fp_arm64_asm.S
+++ b/src/P610/ARM64/fp_arm64_asm.S
@@ -1,5 +1,9 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license
 //
 // Abstract: field arithmetic in 64-bit ARMv8 assembly for P610 on Linux
 //*******************************************************************************************
diff --git a/src/P610/P610.c b/src/P610/P610.c
index a4638de..6ea3aa8 100644
--- a/src/P610/P610.c
+++ b/src/P610/P610.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P610
 *********************************************************************************************/  
@@ -27,12 +31,10 @@ const uint64_t p610x2[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF
                                                      0x62F09BD154B5605C, 0x35CF7E8A091FF357, 0x64AB65F421884A55, 0x03202184A3CFB119, 0x00000004F7ED4ED1 };
 const uint64_t p610x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xB807FFFFFFFFFFFF, 
                                                      0xC5E137A2A96AC0B9, 0x6B9EFD14123FE6AE, 0xC956CBE8431094AA, 0x06404309479F6232, 0x00000009EFDA9DA2 };
+const uint64_t p610x8[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x700FFFFFFFFFFFFF, 
+                                                     0x8BC26F4552D58173, 0xD73DFA28247FCD5D, 0x92AD97D086212954, 0x0C8086128F3EC465, 0x00000013DFB53B44 };
 const uint64_t p610p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x6E02000000000000,
                                                      0xB1784DE8AA5AB02E, 0x9AE7BF45048FF9AB, 0xB255B2FA10C4252A, 0x819010C251E7D88C, 0x000000027BF6A768 };   
-const uint64_t p610x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x3FC0000000000000, 
-                                                     0xD0F642EAB4A9FA32, 0xA308175F6E00CA89, 0xB549A0BDE77B5AAC, 0xCDFDE7B5C304EE69, 0x7FDB7FF0812B12EF, 
-                                                     0xE09BA529B9FE1167, 0xD249C196DAB8CD7F, 0xD4E22754A3F20928, 0x97825638B19A7CCE, 0x05E04550FC4CCE0D, 
-                                                     0x8FB5DA1152CDE50C, 0xF9649BA3EA408644, 0x4473C93E6441063D, 0xBE190269D1337B7B, 0x0000000000000062 }; 
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0002000000000000 }; 
 // Order of Bob's subgroup
@@ -98,6 +100,7 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fpneg                         fpneg610
 #define fpdiv2                        fpdiv2_610
 #define fpcorrection                  fpcorrection610
+#define fpmul                         fpmul610
 #define fpmul_mont                    fpmul610_mont
 #define fpsqr_mont                    fpsqr610_mont
 #define fpinv_mont                    fpinv610_mont
@@ -115,6 +118,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fp2correction                 fp2correction610
 #define fp2mul_mont                   fp2mul610_mont
 #define fp2sqr_mont                   fp2sqr610_mont
+#define fp2mul_c0_mont                fp2mul610_c0_mont
+#define fp2mul_c1_mont                fp2mul610_c1_mont
+#define fp2sqr_c0_mont                fp2sqr610_c0_mont
+#define fp2sqr_c1_mont                fp2sqr610_c1_mont
 #define fp2inv_mont                   fp2inv610_mont
 #define fp2inv_mont_bingcd            fp2inv610_mont_bingcd
 #define fpequal_non_constant_time     fpequal610_non_constant_time
diff --git a/src/P610/P610_api.h b/src/P610/P610_api.h
index 41a3ac3..ee71516 100644
--- a/src/P610/P610_api.h
+++ b/src/P610/P610_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P610
 *********************************************************************************************/  
diff --git a/src/P610/P610_compressed.c b/src/P610/P610_compressed.c
index e07b35f..de0481e 100644
--- a/src/P610/P610_compressed.c
+++ b/src/P610/P610_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * Supersingular Isogeny Key Encapsulation Library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P610_compressed
 *********************************************************************************************/
@@ -28,12 +32,10 @@ const uint64_t p610x2[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFE, 0xFFFFF
                                                      0x62F09BD154B5605C, 0x35CF7E8A091FF357, 0x64AB65F421884A55, 0x03202184A3CFB119, 0x00000004F7ED4ED1 };
 const uint64_t p610x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xB807FFFFFFFFFFFF, 
                                                      0xC5E137A2A96AC0B9, 0x6B9EFD14123FE6AE, 0xC956CBE8431094AA, 0x06404309479F6232, 0x00000009EFDA9DA2 };
+const uint64_t p610x8[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFF8, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x700FFFFFFFFFFFFF, 
+                                                     0x8BC26F4552D58173, 0xD73DFA28247FCD5D, 0x92AD97D086212954, 0x0C8086128F3EC465, 0x00000013DFB53B44 };
 const uint64_t p610p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x6E02000000000000,
                                                      0xB1784DE8AA5AB02E, 0x9AE7BF45048FF9AB, 0xB255B2FA10C4252A, 0x819010C251E7D88C, 0x000000027BF6A768 };   
-const uint64_t p610x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x3FC0000000000000, 
-                                                     0xD0F642EAB4A9FA32, 0xA308175F6E00CA89, 0xB549A0BDE77B5AAC, 0xCDFDE7B5C304EE69, 0x7FDB7FF0812B12EF, 
-                                                     0xE09BA529B9FE1167, 0xD249C196DAB8CD7F, 0xD4E22754A3F20928, 0x97825638B19A7CCE, 0x05E04550FC4CCE0D, 
-                                                     0x8FB5DA1152CDE50C, 0xF9649BA3EA408644, 0x4473C93E6441063D, 0xBE190269D1337B7B, 0x0000000000000062 };
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0002000000000000 }; 
 // Order of Bob's subgroup
@@ -341,6 +343,7 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fpneg                         fpneg610
 #define fpdiv2                        fpdiv2_610
 #define fpcorrection                  fpcorrection610
+#define fpmul                         fpmul610
 #define fpmul_mont                    fpmul610_mont
 #define fpsqr_mont                    fpsqr610_mont
 #define fpinv_mont                    fpinv610_mont
@@ -358,6 +361,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fp2correction                 fp2correction610
 #define fp2mul_mont                   fp2mul610_mont
 #define fp2sqr_mont                   fp2sqr610_mont
+#define fp2mul_c0_mont                fp2mul610_c0_mont
+#define fp2mul_c1_mont                fp2mul610_c1_mont
+#define fp2sqr_c0_mont                fp2sqr610_c0_mont
+#define fp2sqr_c1_mont                fp2sqr610_c1_mont
 #define fp2inv_mont                   fp2inv610_mont
 #define fp2inv_mont_bingcd            fp2inv610_mont_bingcd
 #define fpequal_non_constant_time     fpequal610_non_constant_time
diff --git a/src/P610/P610_compressed_api.h b/src/P610/P610_compressed_api.h
index 8956bef..4f8035b 100644
--- a/src/P610/P610_compressed_api.h
+++ b/src/P610/P610_compressed_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P610 using compression
 *********************************************************************************************/  
diff --git a/src/P610/P610_compressed_dlog_tables.c b/src/P610/P610_compressed_dlog_tables.c
index ed63f3a..bf29c0f 100644
--- a/src/P610/P610_compressed_dlog_tables.c
+++ b/src/P610/P610_compressed_dlog_tables.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for Pohlig-Hellman when using compression
 *********************************************************************************************/ 
diff --git a/src/P610/P610_compressed_pair_tables.c b/src/P610/P610_compressed_pair_tables.c
index 1a8f560..7b674e1 100644
--- a/src/P610/P610_compressed_pair_tables.c
+++ b/src/P610/P610_compressed_pair_tables.c
@@ -1,5 +1,9 @@
 /**************************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression
 ***************************************************************************************************/  
diff --git a/src/P610/P610_internal.h b/src/P610/P610_internal.h
index b933c42..d1e69d6 100644
--- a/src/P610/P610_internal.h
+++ b/src/P610/P610_internal.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: internal header file for P610
 *********************************************************************************************/  
@@ -170,6 +174,8 @@ void rdc610_asm(digit_t* ma, digit_t* mc);
             
 // Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p610, where R=2^640
 void fpmul610_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fpmul610(const digit_t* a, const digit_t* b, digit_t* c);
+void fpmul610_asm(const digit_t* a, const digit_t* b, digit_t* c);
 void mul610_asm(const digit_t* a, const digit_t* b, digit_t* c);
    
 // Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p610, where R=2^640
@@ -209,9 +215,17 @@ void fp2correction610(f2elm_t a);
             
 // GF(p610^2) squaring using Montgomery arithmetic, c = a^2 in GF(p610^2)
 void fp2sqr610_mont(const f2elm_t a, f2elm_t c);
+void fp2sqr610_c0_mont(const digit_t* a, digit_t* c);
+void fp2sqr610_c0_asm(const digit_t* a, digit_t* c);
+void fp2sqr610_c1_mont(const digit_t* a, digit_t* c);
+void fp2sqr610_c1_asm(const digit_t* a, digit_t* c);
  
 // GF(p610^2) multiplication using Montgomery arithmetic, c = a*b in GF(p610^2)
 void fp2mul610_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+void fp2mul610_c0_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul610_c0_asm(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul610_c1_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul610_c1_asm(const digit_t* a, const digit_t* b, digit_t* c);
 
 // GF(p610^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
 void fp2inv610_mont(f2elm_t a);
diff --git a/src/P610/generic/fp_generic.c b/src/P610/generic/fp_generic.c
index e56a343..aa7f68a 100755
--- a/src/P610/generic/fp_generic.c
+++ b/src/P610/generic/fp_generic.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: portable modular arithmetic for P610
 *********************************************************************************************/
diff --git a/src/P751/AMD64/fp_x64.c b/src/P751/AMD64/fp_x64.c
index d9e47fa..ac50804 100644
--- a/src/P751/AMD64/fp_x64.c
+++ b/src/P751/AMD64/fp_x64.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for x64 platforms for P751
 *********************************************************************************************/
@@ -17,7 +21,7 @@ extern const uint64_t p751x4[NWORDS_FIELD];
 
 inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 751)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
@@ -39,7 +43,7 @@ inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c)
 
 inline void mp_sub751_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) || (TARGET == TARGET_ARM64 && NBITS_FIELD == 751)
+#if (OS_TARGET == OS_WIN)
     unsigned int i, borrow = 0;
 
     for (i = 0; i < NWORDS_FIELD; i++) {
diff --git a/src/P751/AMD64/fp_x64_asm.S b/src/P751/AMD64/fp_x64_asm.S
index f3612f4..0452fa5 100644
--- a/src/P751/AMD64/fp_x64_asm.S
+++ b/src/P751/AMD64/fp_x64_asm.S
@@ -1,5 +1,9 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license
 //
 // Abstract: field arithmetic in x64 assembly for P751 on Linux 
 //*******************************************************************************************  
diff --git a/src/P751/ARM64/fp_arm64.c b/src/P751/ARM64/fp_arm64.c
index b3a8365..1a49eb3 100644
--- a/src/P751/ARM64/fp_arm64.c
+++ b/src/P751/ARM64/fp_arm64.c
@@ -1,10 +1,15 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P751
 *********************************************************************************************/
 
 #include "../P751_internal.h"
+#include "../../internal.h"
 
 // Global constants
 extern const uint64_t p751[NWORDS_FIELD];
@@ -13,21 +18,21 @@ extern const uint64_t p751x2[NWORDS_FIELD];
 extern const uint64_t p751x4[NWORDS_FIELD];
 
 
-__inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub751_p2(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 2*p, c = a-b+2p. 
     
     mp_sub751_p2_asm(a, b, c); 
 } 
 
 
-__inline void mp_sub751_p4(const digit_t* a, const digit_t* b, digit_t* c)
+inline void mp_sub751_p4(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction with correction with 4*p, c = a-b+4p. 
     
     mp_sub751_p4_asm(a, b, c);
 }
 
 
-__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular addition, c = a+b mod p751.
   // Inputs: a, b in [0, 2*p751-1] 
   // Output: c in [0, 2*p751-1]
@@ -36,7 +41,7 @@ __inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c)
 } 
 
 
-__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c)
+inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c)
 { // Modular subtraction, c = a-b mod p751.
   // Inputs: a, b in [0, 2*p751-1] 
   // Output: c in [0, 2*p751-1] 
@@ -45,7 +50,7 @@ __inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c)
 }
 
 
-__inline void fpneg751(digit_t* a)
+inline void fpneg751(digit_t* a)
 { // Modular negation, a = -a mod p751.
   // Input/output: a in [0, 2*p751-1] 
     unsigned int i, borrow = 0;
diff --git a/src/P751/ARM64/fp_arm64_asm.S b/src/P751/ARM64/fp_arm64_asm.S
index 216467e..c75278c 100644
--- a/src/P751/ARM64/fp_arm64_asm.S
+++ b/src/P751/ARM64/fp_arm64_asm.S
@@ -1,5 +1,9 @@
 //*******************************************************************************************
 // SIDH: an efficient supersingular isogeny cryptography library
+// Copyright (c) Microsoft Corporation
+//
+// Website: https://github.com/microsoft/PQCrypto-SIDH
+// Released under MIT license
 //
 // Abstract: field arithmetic in 64-bit ARMv8 assembly for P751 on Linux
 //*******************************************************************************************
diff --git a/src/P751/P751.c b/src/P751/P751.c
index b35b3e2..917f98b 100644
--- a/src/P751/P751.c
+++ b/src/P751/P751.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P751
 *********************************************************************************************/  
@@ -29,10 +33,6 @@ const uint64_t p751x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFF
                                                      0x8FB25A1527E1E2A3, 0x6A566C684FDF31DB, 0x213A619F5BAFA1DB, 0x158AD41172C95D20, 0x384A427E5EEB719A, 0x0001BF975507DC70 }; 
 const uint64_t p751p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000,
                                                      0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C };   
-const uint64_t p751x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x2A00000000000000, 
-                                                     0x826D2F56C0F0EAE2, 0xAD4C9CBD81067123, 0xF62CF3052282F124, 0x53A95F7469B516FE, 0x3DADEC0D08A4732F, 0x58AD934557C11C7E, 
-                                                     0x7F731B89B2DA43F2, 0x51AE9F5F5F6AFF3B, 0xD74319A6C9BCA375, 0x5BAB790796CF84D4, 0xA421554FE2E49CA8, 0x20AD617C8DF437CF, 
-                                                     0x3AB06E7A12F5FF7B, 0x70A25E037E40347E, 0x51F1D323FB4C1151, 0xAE0D99AA4835FED9, 0xDF5429960D2536B6, 0x000000030E91D466 };
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; 
 // Order of Bob's subgroup
@@ -117,6 +117,10 @@ const unsigned int strat_Bob[MAX_Bob-1] = {
 #define fp2correction                 fp2correction751
 #define fp2mul_mont                   fp2mul751_mont
 #define fp2sqr_mont                   fp2sqr751_mont
+#define fp2mul_c0_mont                fp2mul751_c0_mont
+#define fp2mul_c1_mont                fp2mul751_c1_mont
+#define fp2sqr_c0_mont                fp2sqr751_c0_mont
+#define fp2sqr_c1_mont                fp2sqr751_c1_mont
 #define fp2inv_mont                   fp2inv751_mont
 #define fp2inv_mont_bingcd            fp2inv751_mont_bingcd
 #define fpequal_non_constant_time     fpequal751_non_constant_time
diff --git a/src/P751/P751_api.h b/src/P751/P751_api.h
index 8fc09e5..6a50273 100644
--- a/src/P751/P751_api.h
+++ b/src/P751/P751_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P751
 *********************************************************************************************/  
diff --git a/src/P751/P751_compressed.c b/src/P751/P751_compressed.c
index a3ef462..44b9596 100644
--- a/src/P751/P751_compressed.c
+++ b/src/P751/P751_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * Supersingular Isogeny Key Encapsulation Library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny parameters and generation of functions for P751_compressed
 *********************************************************************************************/ 
@@ -30,10 +34,6 @@ const uint64_t p751x4[NWORDS64_FIELD]            = { 0xFFFFFFFFFFFFFFFC, 0xFFFFF
                                                      0x8FB25A1527E1E2A3, 0x6A566C684FDF31DB, 0x213A619F5BAFA1DB, 0x158AD41172C95D20, 0x384A427E5EEB719A, 0x0001BF975507DC70 }; 
 const uint64_t p751p1[NWORDS64_FIELD]            = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000,
                                                      0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C };   
-const uint64_t p751x16p[2*NWORDS64_FIELD]        = { 0x0000000000000010, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x2A00000000000000, 
-                                                     0x826D2F56C0F0EAE2, 0xAD4C9CBD81067123, 0xF62CF3052282F124, 0x53A95F7469B516FE, 0x3DADEC0D08A4732F, 0x58AD934557C11C7E, 
-                                                     0x7F731B89B2DA43F2, 0x51AE9F5F5F6AFF3B, 0xD74319A6C9BCA375, 0x5BAB790796CF84D4, 0xA421554FE2E49CA8, 0x20AD617C8DF437CF, 
-                                                     0x3AB06E7A12F5FF7B, 0x70A25E037E40347E, 0x51F1D323FB4C1151, 0xAE0D99AA4835FED9, 0xDF5429960D2536B6, 0x000000030E91D466 }; 
 // Order of Alice's subgroup
 const uint64_t Alice_order[NWORDS64_ORDER]       = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; 
 // Order of Bob's subgroup
@@ -376,6 +376,10 @@ const uint64_t v_3_torsion[TABLE_V3_LEN][2 * NWORDS64_FIELD] =
 #define fp2correction                 fp2correction751
 #define fp2mul_mont                   fp2mul751_mont
 #define fp2sqr_mont                   fp2sqr751_mont
+#define fp2mul_c0_mont                fp2mul751_c0_mont
+#define fp2mul_c1_mont                fp2mul751_c1_mont
+#define fp2sqr_c0_mont                fp2sqr751_c0_mont
+#define fp2sqr_c1_mont                fp2sqr751_c1_mont
 #define fp2inv_mont                   fp2inv751_mont
 #define fp2inv_mont_bingcd            fp2inv751_mont_bingcd
 #define fpequal_non_constant_time     fpequal751_non_constant_time
diff --git a/src/P751/P751_compressed_api.h b/src/P751/P751_compressed_api.h
index 3bc08ed..ea7bc92 100644
--- a/src/P751/P751_compressed_api.h
+++ b/src/P751/P751_compressed_api.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: API header file for P751 using compression
 *********************************************************************************************/  
diff --git a/src/P751/P751_compressed_dlog_tables.c b/src/P751/P751_compressed_dlog_tables.c
index d660d07..425466d 100644
--- a/src/P751/P751_compressed_dlog_tables.c
+++ b/src/P751/P751_compressed_dlog_tables.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for Pohlig-Hellman when using compression
 *********************************************************************************************/ 
diff --git a/src/P751/P751_compressed_pair_tables.c b/src/P751/P751_compressed_pair_tables.c
index f8a8704..f40b4f5 100644
--- a/src/P751/P751_compressed_pair_tables.c
+++ b/src/P751/P751_compressed_pair_tables.c
@@ -1,5 +1,9 @@
 /**************************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: precomputed tables for pairing computation on E0: y^2 = x^3 + x when using compression
 ***************************************************************************************************/  
diff --git a/src/P751/P751_internal.h b/src/P751/P751_internal.h
index d636e7c..6cb25e0 100644
--- a/src/P751/P751_internal.h
+++ b/src/P751/P751_internal.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: internal header file for P751
 *********************************************************************************************/  
@@ -208,9 +212,17 @@ void fp2correction751(f2elm_t a);
             
 // GF(p751^2) squaring using Montgomery arithmetic, c = a^2 in GF(p751^2)
 void fp2sqr751_mont(const f2elm_t a, f2elm_t c);
+void fp2sqr751_c0_mont(const digit_t* a, digit_t* c);
+void fp2sqr751_c0_asm(const digit_t* a, digit_t* c);
+void fp2sqr751_c1_mont(const digit_t* a, digit_t* c);
+void fp2sqr751_c1_asm(const digit_t* a, digit_t* c);
  
 // GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2)
 void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+void fp2mul751_c0_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul751_c0_asm(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul751_c1_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void fp2mul751_c1_asm(const digit_t* a, const digit_t* b, digit_t* c);
 
 // GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
 void fp2inv751_mont(f2elm_t a);
diff --git a/src/P751/generic/fp_generic.c b/src/P751/generic/fp_generic.c
index d07750e..bd89064 100755
--- a/src/P751/generic/fp_generic.c
+++ b/src/P751/generic/fp_generic.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: portable modular arithmetic for P751
 *********************************************************************************************/
diff --git a/src/compression/dlog.c b/src/compression/dlog.c
index a99da68..4cd815c 100644
--- a/src/compression/dlog.c
+++ b/src/compression/dlog.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: Pohlig-Hellman with optimal strategy
 *********************************************************************************************/
diff --git a/src/compression/pairing.c b/src/compression/pairing.c
index 23e1374..fc7e05b 100644
--- a/src/compression/pairing.c
+++ b/src/compression/pairing.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: pairing computation for compression
 *********************************************************************************************/
diff --git a/src/compression/sidh_compressed.c b/src/compression/sidh_compressed.c
index e3357d6..36fe4f0 100644
--- a/src/compression/sidh_compressed.c
+++ b/src/compression/sidh_compressed.c
@@ -1,5 +1,9 @@
 /*************************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) using compression
 **************************************************************************************************/ 
diff --git a/src/compression/sike_compressed.c b/src/compression/sike_compressed.c
index 0279f3d..f8ebadf 100644
--- a/src/compression/sike_compressed.c
+++ b/src/compression/sike_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny key encapsulation (SIKE) protocol using compression
 *********************************************************************************************/ 
diff --git a/src/compression/torsion_basis.c b/src/compression/torsion_basis.c
index f1e391d..e5829fb 100644
--- a/src/compression/torsion_basis.c
+++ b/src/compression/torsion_basis.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: Torsion basis generation for compression
 *********************************************************************************************/
diff --git a/src/config.h b/src/config.h
index 58a5121..d2fcc77 100644
--- a/src/config.h
+++ b/src/config.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: configuration file and platform-dependent macros
 *********************************************************************************************/  
diff --git a/src/ec_isogeny.c b/src/ec_isogeny.c
index b373fc5..09160d0 100644
--- a/src/ec_isogeny.c
+++ b/src/ec_isogeny.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: elliptic curve and isogeny functions
 *********************************************************************************************/
diff --git a/src/fpx.c b/src/fpx.c
index 6eadbd0..c65988a 100644
--- a/src/fpx.c
+++ b/src/fpx.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: core functions over GF(p) and GF(p^2)
 *********************************************************************************************/
@@ -136,19 +140,27 @@ void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords)
 
 void fpmul_mont(const digit_t* ma, const digit_t* mb, digit_t* mc)
 { // Multiprecision multiplication, c = a*b mod p.
+#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751)
+    fpmul(ma, mb, mc);
+#else
     dfelm_t temp = {0};
 
     mp_mul(ma, mb, temp, NWORDS_FIELD);
     rdc_mont(temp, mc);
+#endif
 }
 
 
 void fpsqr_mont(const digit_t* ma, digit_t* mc)
 { // Multiprecision squaring, c = a^2 mod p.
+#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751)
+    fpmul(ma, ma, mc);
+#else
     dfelm_t temp = {0};
 
     mp_mul(ma, ma, temp, NWORDS_FIELD);
     rdc_mont(temp, mc);
+#endif
 }
 
 
@@ -215,7 +227,7 @@ void fp2correction(f2elm_t a)
 
 inline static void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision addition, c = a+b.    
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM)
+#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION)
 
     mp_add(a, b, c, NWORDS_FIELD);
     
@@ -256,7 +268,14 @@ inline unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const
 void fp2sqr_mont(const f2elm_t a, f2elm_t c)
 { // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
   // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] 
-  // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] 
+  // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]  
+#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751)
+    dfelm_t tt1; 
+    
+    fp2sqr_c0_mont(a[0], (digit_t*)tt1);            // c0 = (a0+a1)(a0-a1)
+    fp2sqr_c1_mont(a[0], c[1]);                     // c1 = 2a0*a1
+    fpcopy((digit_t*)tt1, c[0]);
+#else
     felm_t t1, t2, t3;
     
     mp_addfast(a[0], a[1], t1);                      // t1 = a0+a1 
@@ -264,6 +283,7 @@ void fp2sqr_mont(const f2elm_t a, f2elm_t c)
     mp_addfast(a[0], a[0], t3);                      // t3 = 2a0
     fpmul_mont(t1, t2, c[0]);                        // c0 = (a0+a1)(a0-a1)
     fpmul_mont(t3, a[1], c[1]);                      // c1 = 2a0*a1
+#endif
 }
 
 
@@ -280,7 +300,7 @@ inline unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const
 
 inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction followed by addition with p*2^MAXBITS_FIELD, c = a-b+(p*2^MAXBITS_FIELD) if a-b < 0, otherwise c=a-b. 
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM)
+#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION)
     felm_t t1;
 
     digit_t mask = 0 - (digit_t)mp_sub(a, b, c, 2*NWORDS_FIELD);
@@ -288,7 +308,7 @@ inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c)
         t1[i] = ((digit_t*)PRIME)[i] & mask;
     mp_addfast((digit_t*)&c[NWORDS_FIELD], t1, (digit_t*)&c[NWORDS_FIELD]);
 
-#elif (OS_TARGET == OS_NIX)               
+#elif (OS_TARGET == OS_NIX) && (TARGET == TARGET_ARM64 || NBITS_FIELD == 751)               
 
     mp_subaddx2_asm(a, b, c);     
 
@@ -298,12 +318,12 @@ inline static void mp_subaddfast(const digit_t* a, const digit_t* b, digit_t* c)
 
 inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c)
 { // Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD.
-#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM)
+#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION)
 
     mp_sub(c, a, c, 2*NWORDS_FIELD);
     mp_sub(c, b, c, 2*NWORDS_FIELD);
 
-#elif (OS_TARGET == OS_NIX)                 
+#elif (OS_TARGET == OS_NIX) && (TARGET == TARGET_ARM64 || NBITS_FIELD == 751)                 
 
     mp_dblsubx2_asm(a, b, c);
 
@@ -315,6 +335,13 @@ void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c)
 { // GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2).
   // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] 
   // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] 
+#if defined(_MULX_) && defined(_ADX_) && (OS_TARGET == OS_NIX) && (NBITS_FIELD != 751)
+    felm_t t1;
+    
+    fp2mul_c0_mont(a[0], b[0], t1);                  // c0 = a0*b0 - a1*b1
+    fp2mul_c1_mont(a[0], b[0], c[1]);                // c1 = a0*b1 + a1*b0 
+    fpcopy(t1, c[0]);
+#else
     felm_t t1, t2;
     dfelm_t tt1, tt2, tt3; 
     
@@ -325,8 +352,9 @@ void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c)
     mp_mul(t1, t2, tt3, NWORDS_FIELD);               // tt3 = (a0+a1)*(b0+b1)
     mp_dblsubfast(tt1, tt2, tt3);                    // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
     mp_subaddfast(tt1, tt2, tt1);                    // tt1 = a0*b0 - a1*b1 + p*2^MAXBITS_FIELD if a0*b0 - a1*b1 < 0, else tt1 = a0*b0 - a1*b1
-    rdc_mont(tt3, c[1]);                             // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 
-    rdc_mont(tt1, c[0]);                             // c[0] = a0*b0 - a1*b1
+    rdc_mont(tt3, c[1]);                             // c1 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 
+    rdc_mont(tt1, c[0]);                             // c0 = a0*b0 - a1*b1
+#endif
 }
 
 
diff --git a/src/internal.h b/src/internal.h
index 924f246..0e46f3c 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: internal header file for function definitions
 *********************************************************************************************/  
diff --git a/src/random/random.c b/src/random/random.c
index 028acbe..b3b99b5 100644
--- a/src/random/random.c
+++ b/src/random/random.c
@@ -21,7 +21,7 @@
 
 static inline void delay(unsigned int count)
 {
-    while (count--) {}
+    while (count>0) { count--; }
 }
 
 
diff --git a/src/sidh.c b/src/sidh.c
index b6661f6..133f9c2 100644
--- a/src/sidh.c
+++ b/src/sidh.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH)
 *********************************************************************************************/ 
diff --git a/src/sike.c b/src/sike.c
index 36d7293..3c78806 100644
--- a/src/sike.c
+++ b/src/sike.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: supersingular isogeny key encapsulation (SIKE) protocol
 *********************************************************************************************/ 
diff --git a/tests/arith_tests-p434.c b/tests/arith_tests-p434.c
index c3fd2a3..c812c68 100644
--- a/tests/arith_tests-p434.c
+++ b/tests/arith_tests-p434.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: testing code for field arithmetic, elliptic curve and isogeny functions
 *********************************************************************************************/
@@ -12,12 +16,12 @@
 
 
 // Benchmark and test parameters  
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) 
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM) 
     #define BENCH_LOOPS           100       // Number of iterations per bench
     #define SMALL_BENCH_LOOPS     100       // Number of iterations per bench
     #define TEST_LOOPS             10       // Number of iterations per test
 #else
-    #define BENCH_LOOPS        100000 
+    #define BENCH_LOOPS       1000000 
     #define SMALL_BENCH_LOOPS   10000       
     #define TEST_LOOPS            100   
 #endif
@@ -350,7 +354,6 @@ bool fp_run()
     int n;
     unsigned long long cycles, cycles1, cycles2;
     felm_t a, b, c;
-    dfelm_t aa;
         
     printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
     printf("Benchmarking field arithmetic over GF(p434): \n\n"); 
@@ -393,20 +396,6 @@ bool fp_run()
     printf("  GF(p) multiplication runs in .................................... %7lld ", cycles/BENCH_LOOPS); print_unit;
     printf("\n");
 
-    // GF(p) reduction using p434
-    cycles = 0;
-    for (n=0; n<BENCH_LOOPS; n++)
-    {
-        mp_mul(a, b, aa, NWORDS_FIELD);
-
-        cycles1 = cpucycles(); 
-        rdc_mont(aa, c);
-        cycles2 = cpucycles();
-        cycles = cycles+(cycles2-cycles1);
-    }
-    printf("  GF(p) reduction runs in ......................................... %7lld ", cycles/BENCH_LOOPS); print_unit;
-    printf("\n");
-
     // GF(p) inversion
     cycles = 0;
     for (n=0; n<SMALL_BENCH_LOOPS; n++)
diff --git a/tests/arith_tests-p503.c b/tests/arith_tests-p503.c
index 53cf252..c145de0 100644
--- a/tests/arith_tests-p503.c
+++ b/tests/arith_tests-p503.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: testing code for field arithmetic, elliptic curve and isogeny functions
 *********************************************************************************************/
@@ -12,12 +16,12 @@
 
 
 // Benchmark and test parameters  
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) 
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM) 
     #define BENCH_LOOPS           100       // Number of iterations per bench
     #define SMALL_BENCH_LOOPS     100       // Number of iterations per bench
     #define TEST_LOOPS             10       // Number of iterations per test
 #else
-    #define BENCH_LOOPS        100000 
+    #define BENCH_LOOPS       1000000 
     #define SMALL_BENCH_LOOPS   10000       
     #define TEST_LOOPS            100   
 #endif
@@ -350,7 +354,6 @@ bool fp_run()
     int n;
     unsigned long long cycles, cycles1, cycles2;
     felm_t a, b, c;
-    dfelm_t aa;
         
     printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
     printf("Benchmarking field arithmetic over GF(p503): \n\n"); 
@@ -393,20 +396,6 @@ bool fp_run()
     printf("  GF(p) multiplication runs in .................................... %7lld ", cycles/BENCH_LOOPS); print_unit;
     printf("\n");
 
-    // GF(p) reduction using p503
-    cycles = 0;
-    for (n=0; n<BENCH_LOOPS; n++)
-    {
-        mp_mul(a, b, aa, NWORDS_FIELD);
-
-        cycles1 = cpucycles(); 
-        rdc_mont(aa, c);
-        cycles2 = cpucycles();
-        cycles = cycles+(cycles2-cycles1);
-    }
-    printf("  GF(p) reduction runs in ......................................... %7lld ", cycles/BENCH_LOOPS); print_unit;
-    printf("\n");
-
     // GF(p) inversion
     cycles = 0;
     for (n=0; n<SMALL_BENCH_LOOPS; n++)
diff --git a/tests/arith_tests-p610.c b/tests/arith_tests-p610.c
index 997e291..d86710d 100644
--- a/tests/arith_tests-p610.c
+++ b/tests/arith_tests-p610.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: testing code for field arithmetic, elliptic curve and isogeny functions
 *********************************************************************************************/
@@ -12,12 +16,12 @@
 
 
 // Benchmark and test parameters  
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) 
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM) 
     #define BENCH_LOOPS           100       // Number of iterations per bench
     #define SMALL_BENCH_LOOPS     100       // Number of iterations per bench
     #define TEST_LOOPS             10       // Number of iterations per test
 #else
-    #define BENCH_LOOPS        100000 
+    #define BENCH_LOOPS       1000000 
     #define SMALL_BENCH_LOOPS   10000       
     #define TEST_LOOPS            100   
 #endif
@@ -351,7 +355,6 @@ bool fp_run()
     int n;
     unsigned long long cycles, cycles1, cycles2;
     felm_t a, b, c;
-    dfelm_t aa;
         
     printf("\n--------------------------------------------------------------------------------------------------------\n\n"); 
     printf("Benchmarking field arithmetic over GF(p610): \n\n"); 
@@ -394,20 +397,6 @@ bool fp_run()
     printf("  GF(p) multiplication runs in .................................... %7lld ", cycles/BENCH_LOOPS); print_unit;
     printf("\n");
 
-    // GF(p) reduction using p610
-    cycles = 0;
-    for (n=0; n<BENCH_LOOPS; n++)
-    {
-        mp_mul(a, b, aa, NWORDS_FIELD);
-
-        cycles1 = cpucycles(); 
-        rdc_mont(aa, c);
-        cycles2 = cpucycles();
-        cycles = cycles+(cycles2-cycles1);
-    }
-    printf("  GF(p) reduction runs in ......................................... %7lld ", cycles/BENCH_LOOPS); print_unit;
-    printf("\n");
-
     // GF(p) inversion
     cycles = 0;
     for (n=0; n<SMALL_BENCH_LOOPS; n++)
diff --git a/tests/arith_tests-p751.c b/tests/arith_tests-p751.c
index 8297167..142b601 100644
--- a/tests/arith_tests-p751.c
+++ b/tests/arith_tests-p751.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: testing code for field arithmetic, elliptic curve and isogeny functions
 *********************************************************************************************/
@@ -12,12 +16,12 @@
 
 
 // Benchmark and test parameters  
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM)
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM)
     #define BENCH_LOOPS           100       // Number of iterations per bench
     #define SMALL_BENCH_LOOPS     100       // Number of iterations per bench
     #define TEST_LOOPS             10       // Number of iterations per test
 #else
-    #define BENCH_LOOPS        100000 
+    #define BENCH_LOOPS       1000000 
     #define SMALL_BENCH_LOOPS   10000       
     #define TEST_LOOPS            100  
 #endif
diff --git a/tests/test_SIDHp434.c b/tests/test_SIDHp434.c
index 2c6ba59..ad7da68 100644
--- a/tests/test_SIDHp434.c
+++ b/tests/test_SIDHp434.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp434
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp434_compressed.c b/tests/test_SIDHp434_compressed.c
index e4f02f7..3b3074d 100644
--- a/tests/test_SIDHp434_compressed.c
+++ b/tests/test_SIDHp434_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp434_compressed
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp503.c b/tests/test_SIDHp503.c
index b752925..c9c9ef4 100644
--- a/tests/test_SIDHp503.c
+++ b/tests/test_SIDHp503.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp503
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp503_compressed.c b/tests/test_SIDHp503_compressed.c
index 7723615..a23bda4 100644
--- a/tests/test_SIDHp503_compressed.c
+++ b/tests/test_SIDHp503_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp503_compressed
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp610.c b/tests/test_SIDHp610.c
index 3f43b08..7d886fd 100644
--- a/tests/test_SIDHp610.c
+++ b/tests/test_SIDHp610.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp610
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp610_compressed.c b/tests/test_SIDHp610_compressed.c
index 8e5812b..d442a71 100644
--- a/tests/test_SIDHp610_compressed.c
+++ b/tests/test_SIDHp610_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp610_compressed
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp751.c b/tests/test_SIDHp751.c
index 1ccda28..a13f600 100644
--- a/tests/test_SIDHp751.c
+++ b/tests/test_SIDHp751.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp751
 *********************************************************************************************/ 
diff --git a/tests/test_SIDHp751_compressed.c b/tests/test_SIDHp751_compressed.c
index 5ad29c9..80c9579 100644
--- a/tests/test_SIDHp751_compressed.c
+++ b/tests/test_SIDHp751_compressed.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange SIDHp751_compressed
 *********************************************************************************************/ 
diff --git a/tests/test_SIKEp434.c b/tests/test_SIKEp434.c
index f23833b..fead186 100644
--- a/tests/test_SIKEp434.c
+++ b/tests/test_SIKEp434.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp434
 *********************************************************************************************/ 
diff --git a/tests/test_SIKEp434_compressed.c b/tests/test_SIKEp434_compressed.c
index 1b36781..69c432c 100644
--- a/tests/test_SIKEp434_compressed.c
+++ b/tests/test_SIKEp434_compressed.c
@@ -1,5 +1,9 @@
 /**********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp434_compressed
 ***********************************************************************************************/ 
diff --git a/tests/test_SIKEp503.c b/tests/test_SIKEp503.c
index 7269ba7..7e73a9f 100644
--- a/tests/test_SIKEp503.c
+++ b/tests/test_SIKEp503.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp503
 *********************************************************************************************/ 
diff --git a/tests/test_SIKEp503_compressed.c b/tests/test_SIKEp503_compressed.c
index 324cb34..85a1eb0 100644
--- a/tests/test_SIKEp503_compressed.c
+++ b/tests/test_SIKEp503_compressed.c
@@ -1,5 +1,9 @@
 /**********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp503_compressed
 ***********************************************************************************************/ 
diff --git a/tests/test_SIKEp610.c b/tests/test_SIKEp610.c
index b21a788..c7cda6c 100644
--- a/tests/test_SIKEp610.c
+++ b/tests/test_SIKEp610.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp610
 *********************************************************************************************/ 
diff --git a/tests/test_SIKEp610_compressed.c b/tests/test_SIKEp610_compressed.c
index 66e781e..0acff6d 100644
--- a/tests/test_SIKEp610_compressed.c
+++ b/tests/test_SIKEp610_compressed.c
@@ -1,5 +1,9 @@
 /**********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp610_compressed
 ***********************************************************************************************/ 
diff --git a/tests/test_SIKEp751.c b/tests/test_SIKEp751.c
index b451cb5..3ac6b39 100644
--- a/tests/test_SIKEp751.c
+++ b/tests/test_SIKEp751.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp751
 *********************************************************************************************/ 
diff --git a/tests/test_SIKEp751_compressed.c b/tests/test_SIKEp751_compressed.c
index e1d8728..af05c13 100644
--- a/tests/test_SIKEp751_compressed.c
+++ b/tests/test_SIKEp751_compressed.c
@@ -1,5 +1,9 @@
 /**********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism SIKEp751_compressed
 ***********************************************************************************************/ 
diff --git a/tests/test_extras.c b/tests/test_extras.c
index 70a6285..87f4cb1 100644
--- a/tests/test_extras.c
+++ b/tests/test_extras.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: utility functions for testing and benchmarking
 *********************************************************************************************/
diff --git a/tests/test_extras.h b/tests/test_extras.h
index 8ba7a73..682d0e0 100644
--- a/tests/test_extras.h
+++ b/tests/test_extras.h
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: utility header file for tests
 *********************************************************************************************/  
diff --git a/tests/test_sidh.c b/tests/test_sidh.c
index 8f6e5c1..e2103ac 100644
--- a/tests/test_sidh.c
+++ b/tests/test_sidh.c
@@ -1,16 +1,20 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key exchange
 *********************************************************************************************/ 
 
 
 // Benchmark and test parameters  
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) 
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM)
     #define BENCH_LOOPS        5      // Number of iterations per bench 
     #define TEST_LOOPS         5      // Number of iterations per test
 #else
-    #define BENCH_LOOPS       100       
+    #define BENCH_LOOPS     1000       
     #define TEST_LOOPS        10      
 #endif
 
diff --git a/tests/test_sike.c b/tests/test_sike.c
index 33007ae..91b9c4a 100644
--- a/tests/test_sike.c
+++ b/tests/test_sike.c
@@ -1,5 +1,9 @@
 /********************************************************************************************
 * SIDH: an efficient supersingular isogeny cryptography library
+* Copyright (c) Microsoft Corporation
+*
+* Website: https://github.com/microsoft/PQCrypto-SIDH
+* Released under MIT license
 *
 * Abstract: benchmarking/testing isogeny-based key encapsulation mechanism
 *********************************************************************************************/ 
@@ -13,17 +17,17 @@
 #ifdef DO_VALGRIND_CHECK
     #define TEST_LOOPS   1
 #else 
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) 
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM) 
     #define TEST_LOOPS         5      // Number of iterations per test
 #else
     #define TEST_LOOPS        10      
 #endif     
 #endif
 
-#if defined(GENERIC_IMPLEMENTATION) || (TARGET == TARGET_ARM) 
+#if defined(GENERIC_IMPLEMENTATION) || (OS_TARGET == OS_WIN) || (TARGET == TARGET_ARM) 
     #define BENCH_LOOPS        5      // Number of iterations per bench 
 #else
-    #define BENCH_LOOPS      100    
+    #define BENCH_LOOPS     1000    
 #endif