diff --git a/b.bat b/b.bat
deleted file mode 100644
index 606db8a92..000000000
--- a/b.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-nasm -f elf timer.asm
-gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
\ No newline at end of file
diff --git a/bn.c b/bn.c
deleted file mode 100644
index 8175c3306..000000000
--- a/bn.c
+++ /dev/null
@@ -1,3908 +0,0 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision 
- * integer arithmetic as well as number theoretic functionality.
- * 
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.  
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
- */
-#include "bn.h"
-
-/* chars used in radix conversions */
-static const char *s_rmap = 
-  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-#undef MIN
-#define MIN(x,y) ((x)<(y)?(x):(y))
-#undef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-
-#ifdef DEBUG
-
-/* timing data */
-#ifdef TIMER_X86
-extern ulong64 gettsc(void);
-#else
-ulong64 gettsc(void) { return clock(); }
-#endif
-
-/* structure to hold timing data */
-struct {
-   char *func;
-   ulong64 start, end, tot;
-} timings[1000000];
-
-/* structure to hold consolidated timing data */
-struct _functime { 
-   char *func;
-   ulong64 tot;
-} functime[1000];
-
-static char *_funcs[1000];
-int _ifuncs, _itims;
-
-#define REGFUNC(name) int __IX = _itims++; _funcs[_ifuncs++] = name; timings[__IX].func = name; timings[__IX].start = gettsc();
-#define DECFUNC()     timings[__IX].end = gettsc(); --_ifuncs;
-#define VERIFY(val)   _verify(val, #val, __LINE__);
-
-/* sort the consolidated timings */
-int qsort_helper(const void *A, const void *B)
-{
-   struct _functime *a, *b;
-   
-   a = (struct _functime *)A;
-   b = (struct _functime *)B;
-   
-   if (a->tot > b->tot) return -1;
-   if (a->tot < b->tot) return 1;
-   return 0;
-}
-
-/* reset debugging information */
-void reset_timings(void)
-{
-   _ifuncs = _itims = 0;
-}   
-
-/* dump the timing data */
-void dump_timings(void)
-{
-   int x, y;
-   ulong64 total;
-   
-   /* first for every find the total time */
-   printf("Phase I  ... Finding totals (%d samples)...\n", _itims);
-   for (x = 0; x < _itims; x++) {
-       timings[x].tot = timings[x].end - timings[x].start;
-   }
-   
-   /* now subtract the time for each function where nested functions occured */
-   printf("Phase II ... Finding dependencies...\n");
-   for (x = 0; x < _itims-1; x++) {
-       for (y = x+1; y < _itims && timings[y].start <= timings[x].end; y++) {
-           timings[x].tot -= timings[y].tot;
-           if (timings[x].tot > ((ulong64)1 << (ulong64)40)) {
-              timings[x].tot = 0;
-           }
-       }
-   }
-   
-   /* now consolidate all the entries */
-   printf("Phase III... Consolidation...\n");
-   
-   memset(&functime, 0, sizeof(functime));
-   total = 0;
-   for (x = 0; x < _itims; x++) {
-       if (strcmp(timings[x].func, "_verify")) 
-          total += timings[x].tot;
-       
-       /* try to find this entry */
-       for (y = 0; functime[y].func != NULL; y++) {
-           if (strcmp(timings[x].func, functime[y].func) == 0) {
-              break;
-           }
-       }
-       
-       if (functime[y].func == NULL) {
-          /* new entry */
-          functime[y].func = timings[x].func;
-          functime[y].tot  = timings[x].tot;
-       } else { 
-          functime[y].tot  += timings[x].tot;
-       }
-   }
-   
-   for (x = 0; functime[x].func != NULL; x++);
-   
-   /* sort and dump */
-   qsort(&functime, x, sizeof(functime[0]), &qsort_helper);
-   
-   for (x = 0; functime[x].func != NULL; x++) {
-      if (functime[x].tot > 0 && strcmp(functime[x].func, "_verify") != 0) {
-         printf("%30s: %20llu (%3llu.%03llu %%)\n", functime[x].func, functime[x].tot, (functime[x].tot * (ulong64)100) / total, ((functime[x].tot * (ulong64)100000) / total) % (ulong64)1000);
-      }
-   }
-}   
-
-static void _verify(mp_int *a, char *name, int line)
-{
-  int n, y;
-  static const char *err[] = { "Null DP", "alloc < used", "digits above used" };
-  
-  REGFUNC("_verify");
-
-  /* dp null ? */
-  y = 0;
-  if (a->dp == NULL) goto error;
-  
-  /* used should be <= alloc */
-  ++y;
-  if (a->alloc < a->used) goto error;
-  
-  /* digits above used should be zero */
-  ++y;
-  for (n = a->used; n < a->alloc; n++) {
-     if (a->dp[n]) goto error;
-  }
-  
-  /* ok */
-  DECFUNC();
-  return;
-error:
-  printf("Error (%s) with variable {%s} on line %d\n", err[y], name, line);
-  for (n = _ifuncs - 1; n >= 0; n--) {
-      if (_funcs[n] != NULL) {
-         printf("> %s\n", _funcs[n]);
-      }
-  }
-  printf("\n");
-  exit(0);
-}
-
-#else /* don't use DEBUG stuff so these macros are blank */
-
-#define REGFUNC(name)
-#define DECFUNC()
-#define VERIFY(val)
-
-#endif 
-
-/* init a new bigint */
-int mp_init(mp_int *a)
-{
-    REGFUNC("mp_init");
-    
-    /* allocate ram required and clear it */
-    a->dp = calloc(sizeof(mp_digit), MP_PREC);
-    if (a->dp == NULL) {
-       DECFUNC();
-       return MP_MEM;
-    }
-    
-    /* set the used to zero, allocated digit to the default precision 
-     * and sign to positive */
-    a->used  = 0;
-    a->alloc = MP_PREC;
-    a->sign  = MP_ZPOS;
-    
-    VERIFY(a);
-    DECFUNC();
-    return MP_OKAY;
-}
-
-/* clear one (frees)  */
-void mp_clear(mp_int *a)
-{
-   REGFUNC("mp_clear");
-   if (a->dp != NULL) {
-      VERIFY(a);
-      
-      /* first zero the digits */
-      memset(a->dp, 0, sizeof(mp_digit) * a->used);
-      
-      /* free ram */
-      free(a->dp);
-      
-      /* reset members to make debugging easier */
-      a->dp = NULL;
-      a->alloc = a->used = 0;
-   }
-   DECFUNC();
-}
-
-void mp_exch(mp_int *a, mp_int *b)
-{
-   mp_int t;
-  
-   REGFUNC("mp_exch");
-   VERIFY(a);
-   VERIFY(b);
-   t = *a; *a = *b; *b = t;
-   DECFUNC();
-}   
-
-/* grow as required */
-static int mp_grow(mp_int *a, int size)
-{
-   int i, n;
-   
-   REGFUNC("mp_grow");
-   VERIFY(a);
-   
-   /* if the alloc size is smaller alloc more ram */
-   if (a->alloc < size) {
-      size += (MP_PREC*2) - (size & (MP_PREC-1));           /* ensure there are always at least 16 digits extra on top */
-     
-      a->dp = realloc(a->dp, sizeof(mp_digit)*size);
-      if (a->dp == NULL) {
-         DECFUNC();
-         return MP_MEM;
-      }
-
-      n = a->alloc;
-      a->alloc = size;
-      for (i = n; i < a->alloc; i++) {
-          a->dp[i] = 0;
-      }
-   }
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* shrink a bignum */
-int mp_shrink(mp_int *a)
-{
-   REGFUNC("mp_shrink");
-   VERIFY(a);
-   if (a->alloc != a->used) {
-      if ((a->dp = realloc(a->dp, sizeof(mp_digit) * a->used)) == NULL) {
-         DECFUNC();
-         return MP_MEM;
-      }
-      a->alloc = a->used;
-   }
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* trim unused digits */
-static void mp_clamp(mp_int *a)
-{
-   REGFUNC("mp_clamp");
-   VERIFY(a);
-   while (a->used > 0 && a->dp[a->used-1] == 0) --(a->used);
-   if (a->used == 0) {
-      a->sign = MP_ZPOS;
-   }      
-   DECFUNC();
-}   
-   
-/* set to zero */
-void mp_zero(mp_int *a)
-{
-   REGFUNC("mp_zero");
-   VERIFY(a);
-   a->sign = MP_ZPOS;
-   a->used = 0;
-   memset(a->dp, 0, sizeof(mp_digit) * a->alloc);
-   DECFUNC();
-}
-
-/* set to a digit */
-void mp_set(mp_int *a, mp_digit b)
-{
-   REGFUNC("mp_set");
-   VERIFY(a);
-   mp_zero(a);
-   a->dp[0] = b & MP_MASK;
-   a->used  = (a->dp[0] != 0) ? 1: 0;
-   DECFUNC();
-}
-
-/* set a 32-bit const */
-int mp_set_int(mp_int *a, unsigned long b)
-{
-   int x, res;
-   
-   REGFUNC("mp_set_int");
-   VERIFY(a);
-   mp_zero(a);
-
-   /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
-   for (x = 0; x < 8; x++) {
-   
-      /* shift the number up four bits */
-      if ((res = mp_mul_2d(a, 4, a)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-      
-      /* OR in the top four bits of the source */      
-      a->dp[0] |= (b>>28)&15;
-      
-      /* shift the source up to the next four bits */
-      b <<= 4;
-      
-      /* ensure that digits are not clamped off */      
-      a->used += 32/DIGIT_BIT + 1;
-   }
-   
-   mp_clamp(a);
-   DECFUNC();
-   return MP_OKAY;
-}   
-
-/* init a mp_init and grow it to a given size */
-int mp_init_size(mp_int *a, int size)
-{
-   REGFUNC("mp_init_size");
-   
-   /* pad up so there are at least 16 zero digits */
-   size += (MP_PREC*2) - (size & (MP_PREC-1));           /* ensure there are always at least 16 digits extra on top */
-   a->dp = calloc(sizeof(mp_digit), size);
-   if (a->dp == NULL) {
-      DECFUNC();
-      return MP_MEM;
-   }
-   a->used  = 0;
-   a->alloc = size;
-   a->sign  = MP_ZPOS;
-
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* copy, b = a */
-int mp_copy(mp_int *a, mp_int *b)
-{
-   int res, n;
-   
-   REGFUNC("mp_copy");
-   VERIFY(a);
-   VERIFY(b);
-
-   /* if dst == src do nothing */
-   if (a == b || a->dp == b->dp) {
-      DECFUNC();
-      return MP_OKAY;
-   }
-   
-   /* grow dest */
-   if ((res = mp_grow(b, a->used)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   /* zero b and copy the parameters over */
-   b->used = a->used;
-   b->sign = a->sign;
-   
-   /* copy all the digits */
-   for (n = 0; n < a->used; n++) {
-       b->dp[n] = a->dp[n];
-   }
-   
-   /* clear high digits */
-   for (n = b->used; n < b->alloc; n++) {
-       b->dp[n] = 0;
-   }
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* creates "a" then copies b into it */
-int mp_init_copy(mp_int *a, mp_int *b)
-{
-  int res;
-  
-  REGFUNC("mp_init_copy");
-  VERIFY(b);
-  if ((res = mp_init(a)) != MP_OKAY) {
-     DECFUNC();
-     return res;
-  }
-  res = mp_copy(b, a);
-  DECFUNC();
-  return res;
-}
-
-/* b = |a| */
-int mp_abs(mp_int *a, mp_int *b)
-{
-   int res;
-   REGFUNC("mp_abs");
-   VERIFY(a);
-   VERIFY(b);
-   if ((res = mp_copy(a, b)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   b->sign = MP_ZPOS;
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* b = -a */
-int mp_neg(mp_int *a, mp_int *b)
-{
-   int res;
-   REGFUNC("mp_neg");
-   VERIFY(a);
-   VERIFY(b);
-   if ((res = mp_copy(a, b)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* compare maginitude of two ints (unsigned) */
-int mp_cmp_mag(mp_int *a, mp_int *b) 
-{
-   int n;
-
-   REGFUNC("mp_cmp_mag");
-   VERIFY(a);
-   VERIFY(b);
-   
-   /* compare based on # of non-zero digits */   
-   if (a->used > b->used) {
-      DECFUNC();
-      return MP_GT;
-   } else if (a->used < b->used) {
-      DECFUNC();
-      return MP_LT;
-   }
-   
-   /* compare based on digits  */
-   for (n = a->used - 1; n >= 0; n--) {
-       if (a->dp[n] > b->dp[n]) {
-          DECFUNC();
-          return MP_GT;
-       } else if (a->dp[n] < b->dp[n]) {
-          DECFUNC();
-          return MP_LT;
-       }
-   }
-   DECFUNC();
-   return MP_EQ;
-}
-
-/* compare two ints (signed)*/
-int mp_cmp(mp_int *a, mp_int *b)
-{
-   int res;
-   REGFUNC("mp_cmp");
-   VERIFY(a);
-   VERIFY(b);
-   /* compare based on sign */
-   if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
-      DECFUNC();
-      return MP_LT;
-   } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
-      DECFUNC();
-      return MP_GT;
-   }
-   res = mp_cmp_mag(a, b);
-   DECFUNC();
-   return res;
-}
-
-/* compare a digit */
-int mp_cmp_d(mp_int *a, mp_digit b)
-{
-   REGFUNC("mp_cmp_d");
-   VERIFY(a);
-   
-   if (a->sign == MP_NEG) {
-      DECFUNC();
-      return MP_LT;
-   }
-   
-   if (a->used > 1) {
-      DECFUNC();
-      return MP_GT;
-   }
-   
-   if (a->dp[0] > b) {
-      DECFUNC();
-      return MP_GT;
-   } else if (a->dp[0] < b) {
-      DECFUNC();
-      return MP_LT;
-   } else {
-      DECFUNC();
-      return MP_EQ;
-   }
-}
-
-/* shift right a certain amount of digits */
-void mp_rshd(mp_int *a, int b)
-{
-   int x;
-   
-   REGFUNC("mp_rshd");
-   VERIFY(a);
-   
-   /* if b <= 0 then ignore it */
-   if (b <= 0) {
-      DECFUNC();
-      return;
-   }
-   
-   /* if b > used then simply zero it and return */
-   if (a->used < b) {
-      mp_zero(a);
-      DECFUNC();
-      return;
-   }
-   
-   /* shift the digits down */
-   for (x = 0; x < (a->used - b); x++) {
-       a->dp[x] = a->dp[x + b];
-   }
-   
-   /* zero the top digits */
-   for (; x < a->used; x++) {
-       a->dp[x] = 0;
-   }
-   mp_clamp(a);
-   DECFUNC();
-}
-
-/* shift left a certain amount of digits */
-int mp_lshd(mp_int *a, int b)
-{
-   int x, res;
-   
-   REGFUNC("mp_lshd");
-   VERIFY(a);
-   
-   /* if its less than zero return */
-   if (b <= 0) {
-      DECFUNC();
-      return MP_OKAY;
-   }
-      
-   /* grow to fit the new digits */
-   if ((res = mp_grow(a, a->used + b)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   /* increment the used by the shift amount than copy upwards */
-   a->used += b;
-   for (x = a->used-1; x >= b; x--) {
-       a->dp[x] = a->dp[x - b];
-   }
-   
-   /* zero the lower digits */
-   for (x = 0; x < b; x++) {
-       a->dp[x] = 0;
-   }
-   mp_clamp(a);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* calc a value mod 2^b */
-int mp_mod_2d(mp_int *a, int b, mp_int *c)
-{
-   int x, res;
-   
-   REGFUNC("mp_mod_2d");
-   VERIFY(a);
-   VERIFY(c);
-   
-   /* if b is <= 0 then zero the int */
-   if (b <= 0) {
-      mp_zero(c);
-      DECFUNC();
-      return MP_OKAY;
-   }
-   
-   /* if the modulus is larger than the value than return */
-   if (b > (int)(a->used * DIGIT_BIT)) {
-      res = mp_copy(a, c);
-      DECFUNC();
-      return res;      
-   }
-   
-   /* copy */
-   if ((res = mp_copy(a, c)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-
-   /* zero digits above the last digit of the modulus */
-   for (x = (b/DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
-       c->dp[x] = 0;
-   }
-   /* clear the digit that is not completely outside/inside the modulus */
-   c->dp[b/DIGIT_BIT] &= (mp_digit)((((mp_digit)1)<<(((mp_digit)b) % DIGIT_BIT)) - ((mp_digit)1));
-   mp_clamp(c);
-   DECFUNC();
-   return MP_OKAY;
-}
-   
-/* shift right by a certain bit count (store quotient in c, remainder in d) */
-int  mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d)
-{
-   mp_digit D, r, rr;
-   int x, res;
-   mp_int t;
-   
-   REGFUNC("mp_div_2d");
-   VERIFY(a);
-   VERIFY(c);
-   if (d != NULL) { VERIFY(d); }
-   
-   /* if the shift count is <= 0 then we do no work */
-   if (b <= 0) {
-      res = mp_copy(a, c);
-      if (d != NULL) { mp_zero(d); }
-      DECFUNC();
-      return res;
-   }      
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   /* get the remainder */
-   if (d != NULL) {
-      if ((res = mp_mod_2d(a, b, &t)) != MP_OKAY) {
-         mp_clear(&t);
-         DECFUNC();
-         return res;
-      }
-   }
-   
-   /* copy */
-   if ((res = mp_copy(a, c)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   
-   /* shift by as many digits in the bit count */
-   mp_rshd(c, b/DIGIT_BIT);
-   
-   /* shift any bit count < DIGIT_BIT */
-   D = (mp_digit)(b % DIGIT_BIT);
-   if (D != 0) {
-      r = 0;
-      for (x = c->used - 1; x >= 0; x--) {
-          /* get the lower  bits of this word in a temp */
-          rr = c->dp[x] & ((mp_digit)((1U<<D)-1U));
-          
-          /* shift the current word and mix in the carry bits from the previous word */
-          c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT-D));
-          
-          /* set the carry to the carry bits of the current word found above */
-          r  = rr;
-      }
-   }
-   mp_clamp(c);
-   res = MP_OKAY;
-   if (d != NULL) {
-      mp_exch(&t, d);
-   }
-   mp_clear(&t);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* shift left by a certain bit count */
-int mp_mul_2d(mp_int *a, int b, mp_int *c)
-{
-   mp_digit d, r, rr;
-   int x, res;
-   
-   REGFUNC("mp_mul_2d");
-   VERIFY(a);
-   VERIFY(c);
-   
-   /* copy */
-   if ((res = mp_copy(a, c)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-
-   if ((res = mp_grow(c, c->used + b/DIGIT_BIT + 1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   /* shift by as many digits in the bit count */
-   if ((res = mp_lshd(c, b/DIGIT_BIT)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   c->used = c->alloc;
-   
-   /* shift any bit count < DIGIT_BIT */
-   d = (mp_digit)(b % DIGIT_BIT);   
-   if (d != 0) {
-      r = 0;
-      for (x = 0; x < c->used; x++) {
-          /* get the higher bits of the current word */
-          rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit)((1U<<d)-1U));
-          
-          /* shift the current word and OR in the carry */
-          c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
-          
-          /* set the carry to the carry bits of the current word */
-          r  = rr;
-      }
-   }
-   mp_clamp(c);
-   DECFUNC();
-   return MP_OKAY;
-}   
-
-/* b = a/2 */
-int mp_div_2(mp_int *a, mp_int *b)
-{
-   mp_digit r, rr;
-   int x, res;
-   
-   REGFUNC("mp_div_2");
-   VERIFY(a);
-   VERIFY(b);
-
-   /* copy */
-   if ((res = mp_copy(a, b)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   r = 0;
-   for (x = b->used - 1; x >= 0; x--) {
-       rr = b->dp[x] & 1;
-       b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT-1));
-       r  = rr;
-   }
-   mp_clamp(b);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* b = a*2 */
-int mp_mul_2(mp_int *a, mp_int *b)
-{
-   mp_digit r, rr;
-   int x, res;
-   
-   REGFUNC("mp_mul_2");
-   VERIFY(a);
-   VERIFY(b);
-   
-   /* copy */
-   if ((res = mp_copy(a, b)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-
-   if ((res = mp_grow(b, b->used + 1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   ++b->used;
-   
-   /* shift any bit count < DIGIT_BIT */
-   r = 0;
-   for (x = 0; x < b->used; x++) {
-       rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1;
-       b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK;
-       r  = rr;
-   }
-   mp_clamp(b);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* low level addition, based on HAC pp.594, Algorithm 14.7 */
-static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
-{
-   mp_int *x;
-   int olduse, res, min, max, i;
-   mp_digit u;
-   
-   REGFUNC("s_mp_add");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   /* find sizes, we let |a| <= |b| which means we have to sort
-    * them.  "x" will point to the input with the most digits
-    */
-   if (a->used > b->used) {
-      min = b->used;
-      max = a->used;
-      x   = a;
-   } else if (a->used < b->used) {
-      min = a->used;
-      max = b->used;
-      x   = b;
-   } else {
-      min = max = a->used;
-      x = NULL;
-   }
-   
-   /* init result */
-   if (c->alloc < max+1) {
-      if ((res = mp_grow(c, max+1)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
- 
-   olduse  = c->used;
-   c->used = max + 1;
-
-   /* add digits from lower part */
-   
-   /* set the carry to zero */
-   u = 0;
-   for (i = 0; i < min; i++) {
-       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */   
-       c->dp[i] = a->dp[i] + b->dp[i] + u;
-       
-       /* U = carry bit of T[i] */       
-       u = (c->dp[i] >> DIGIT_BIT) & 1;
-       
-       /* take away carry bit from T[i] */
-       c->dp[i] &= MP_MASK;
-   }
-   
-   /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
-   if (min != max) {
-      for (; i < max; i++) { 
-         /* T[i] = X[i] + U */
-         c->dp[i] = x->dp[i] + u;
-         
-         /* U = carry bit of T[i] */
-         u = (c->dp[i] >> DIGIT_BIT) & 1;
-         
-         /* take away carry bit from T[i] */
-         c->dp[i] &= MP_MASK;
-      }
-   }
-   
-   /* add carry */
-   c->dp[i] = u;
-   
-   /* clear digits above used (since we may not have grown result above) */
-   for (i = c->used; i < olduse; i++) {
-      c->dp[i] = 0;
-   }
-   
-   mp_clamp(c);
-   DECFUNC();
-   return MP_OKAY;       
-}
-
-/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */
-static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
-{
-   int olduse, res, min, max, i;
-   mp_digit u;
-   
-   REGFUNC("s_mp_sub");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   /* find sizes */
-   min = b->used;
-   max = a->used;
-   
-   /* init result */
-   if (c->alloc < max) {
-      if ((res = mp_grow(c, max)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
-   olduse  = c->used;
-   c->used = max;
-   
-   /* sub digits from lower part */
-   
-   /* set carry to zero */
-   u = 0;
-   for (i = 0; i < min; i++) {
-       /* T[i] = A[i] - B[i] - U */
-       c->dp[i] = a->dp[i] - (b->dp[i] + u);
-       
-       /* U = carry bit of T[i] */
-       u = (c->dp[i] >> DIGIT_BIT) & 1;
-       
-       /* Clear carry from T[i] */
-       c->dp[i] &= MP_MASK;
-   }
-   
-   /* now copy higher words if any, e.g. if A has more digits than B  */
-   if (min != max) {
-      for (; i < max; i++) { 
-         /* T[i] = A[i] - U */
-         c->dp[i] = a->dp[i] - u;
-
-         /* U = carry bit of T[i] */
-         u = (c->dp[i] >> DIGIT_BIT) & 1;
-
-         /* Clear carry from T[i] */
-         c->dp[i] &= MP_MASK;
-      }
-   }
-   
-   /* clear digits above used (since we may not have grown result above) */
-   for (i = c->used; i < olduse; i++) {
-      c->dp[i] = 0;
-   }
-
-   mp_clamp(c);
-   DECFUNC();
-   return MP_OKAY;       
-}
-
-/* low level multiplication */
-#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
-
-/* Fast (comba) multiplier
- *
- * This is the fast column-array [comba] multiplier.  It is designed to compute
- * the columns of the product first then handle the carries afterwards.  This 
- * has the effect of making the nested loops that compute the columns very
- * simple and schedulable on super-scalar processors.
- *
- */
-static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
-{
-   int olduse, res, pa, ix;
-   mp_word W[512];
-   
-   REGFUNC("fast_s_mp_mul_digs");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   if (c->alloc < digs) {
-      if ((res = mp_grow(c, digs)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
-   
-   /* clear temp buf (the columns) */
-   memset(W, 0, sizeof(mp_word) * digs);
-   
-   /* calculate the columns */
-   pa = a->used;
-   for (ix = 0; ix < pa; ix++) {
-   
-       /* this multiplier has been modified to allow you to control how many digits 
-        * of output are produced.  So at most we want to make upto "digs" digits
-        * of output
-        */
-       
-       
-       /* this adds products to distinct columns (at ix+iy) of W
-        * note that each step through the loop is not dependent on
-        * the previous which means the compiler can easily unroll
-        * the loop without scheduling problems
-        */
-       {
-          register mp_digit tmpx, *tmpy;
-          register mp_word  *_W;
-          register int      iy, pb;
-          
-          /* alias for the the word on the left e.g. A[ix] * A[iy] */
-          tmpx = a->dp[ix];
-          
-          /* alias for the right side */
-          tmpy = b->dp;
-
-          /* alias for the columns, each step through the loop adds a new
-             term to each column 
-           */
-          _W   = W + ix;
-          
-          /* the number of digits is limited by their placement.  E.g. 
-             we avoid multiplying digits that will end up above the # of
-             digits of precision requested
-           */
-          pb = MIN(b->used, digs - ix);
-          
-          for (iy = 0; iy < pb; iy++) {
-              *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); 
-          }
-       }
-
-   }
-   
-   /* setup dest */
-   olduse  = c->used;
-   c->used = digs;
-
-   
-   /* At this point W[] contains the sums of each column.  To get the
-    * correct result we must take the extra bits from each column and
-    * carry them down
-    *
-    * Note that while this adds extra code to the multiplier it saves time
-    * since the carry propagation is removed from the above nested loop.
-    * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-    * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-    * this is slower but on most cryptographic size numbers it is faster.
-    */
-    
-   for (ix = 1; ix < digs; ix++) {
-       W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
-   }
-   c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
-   
-   /* clear unused */
-   for (; ix < olduse; ix++) {
-      c->dp[ix] = 0;
-   }
-  
-   mp_clamp(c);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* multiplies |a| * |b| and only computes upto digs digits of result 
- * HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits of 
- * output are created.  
- */
-static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
-{
-   mp_int t;
-   int res, pa, pb, ix, iy;
-   mp_digit u;
-   mp_word r;
-   mp_digit tmpx, *tmpt, *tmpy;
-   
-   REGFUNC("s_mp_mul_digs");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   /* can we use the fast multiplier? 
-    *
-    * The fast multiplier can be used if the output will have less than 
-    * 512 digits and the number of digits won't affect carry propagation
-    */
-   if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
-      res = fast_s_mp_mul_digs(a,b,c,digs);
-      DECFUNC();
-      return res;
-   }  
-   
-   if ((res = mp_init_size(&t, digs)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   t.used = digs;
-   
-   /* compute the digits of the product directly */
-   pa = a->used;
-   for (ix = 0; ix < pa; ix++) {
-       /* set the carry to zero */
-       u = 0;
-       
-       /* limit ourselves to making digs digits of output */
-       pb = MIN(b->used, digs - ix);
-              
-       /* setup some aliases */
-       tmpx = a->dp[ix];
-       tmpt = &(t.dp[ix]);
-       tmpy = b->dp;
-       
-       /* compute the columns of the output and propagate the carry */
-       for (iy = 0; iy < pb; iy++) {
-           /* compute the column as a mp_word */
-           r       = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
-           
-           /* the new column is the lower part of the result */           
-           *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
-           
-           /* get the carry word from the result */
-           u       = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
-       }
-       if (ix+iy<digs)
-          *tmpt = u;
-   }
-   
-   mp_clamp(&t);
-   mp_exch(&t, c);
-
-   mp_clear(&t);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* this is a modified version of fast_s_mp_mul_digs that only produces
- * output digits *above* digs.  See the comments for fast_s_mp_mul_digs 
- * to see how it works.
- *
- * This is used in the Barrett reduction since for one of the multiplications
- * only the higher digits were needed.  This essentially halves the work.
- */
-static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
-{
-   int oldused, newused, res, pa, pb, ix;
-   mp_word W[512];
-   
-   REGFUNC("fast_s_mp_mul_high_digs");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   newused = a->used + b->used + 1;
-   if (c->alloc < newused) {
-      if ((res = mp_grow(c, newused)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
-   
-   /* like the other comba method we compute the columns first */
-   pa = a->used;
-   pb = b->used;
-   memset(&W[digs], 0, (pa + pb + 1 - digs) * sizeof(mp_word));
-   for (ix = 0; ix < pa; ix++) {
-       {
-	      register mp_digit tmpx, *tmpy;
-	      register int      iy;
-	      register mp_word  *_W;
-
-          /* work todo, that is we only calculate digits that are at "digs" or above  */
-          iy   = digs - ix;
-          
-          /* copy of word on the left of A[ix] * B[iy] */
-          tmpx = a->dp[ix];
-          
-          /* alias for right side */
-          tmpy = b->dp + iy;
-          
-          /* alias for the columns of output.  Offset to be equal to or above the 
-           * smallest digit place requested 
-           */
-          _W   = &(W[digs]);
-       
-          /* compute column products for digits above the minimum */
-          for (; iy < pb; iy++) {
-              *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
-          }
-       }
-   }
-   
-   /* setup dest */
-   oldused = c->used;
-   c->used = newused;
-   
-   /* now convert the array W downto what we need */
-   for (ix = digs+1; ix < newused; ix++) {
-       W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
-   }
-   c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
-   
-   for (; ix < oldused; ix++) {
-      c->dp[ix] = 0;
-   }
-   mp_clamp(c);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* multiplies |a| * |b| and does not compute the lower digs digits 
- * [meant to get the higher part of the product]
- */
-static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
-{
-   mp_int t;
-   int res, pa, pb, ix, iy;
-   mp_digit u;
-   mp_word r;
-   mp_digit tmpx, *tmpt, *tmpy;
-   
-   REGFUNC("s_mp_mul_high_digs");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   /* can we use the fast multiplier? */
-   if (((a->used + b->used + 1) < 512) && MAX(a->used, b->used) < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
-      res = fast_s_mp_mul_high_digs(a,b,c,digs);
-      DECFUNC();
-      return res;
-   }  
-
-   if ((res = mp_init_size(&t, a->used + b->used + 1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   t.used = a->used + b->used + 1;
-   
-   pa = a->used;
-   pb = b->used;
-   for (ix = 0; ix < pa; ix++) {
-       /* clear the carry */
-       u = 0;
-       
-       /* left hand side of A[ix] * B[iy] */
-       tmpx = a->dp[ix];
-       
-       /* alias to the address of where the digits will be stored */       
-       tmpt = &(t.dp[digs]);
-       
-       /* alias for where to read the right hand side from */       
-       tmpy = b->dp + (digs - ix);
-       
-       for (iy = digs - ix; iy < pb; iy++) {
-           /* calculate the double precision result */
-           r       = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
-           
-           /* get the lower part */
-           *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
-           
-           /* carry the carry */
-           u       = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
-       }
-       *tmpt = u;
-   }
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* fast squaring 
- *
- * This is the comba method where the columns of the product are computed first 
- * then the carries are computed.  This has the effect of making a very simple
- * inner loop that is executed the most
- *
- * W2 represents the outer products and W the inner.  
- *
- * A further optimizations is made because the inner products are of the form
- * "A * B * 2".  The *2 part does not need to be computed until the end which is
- * good because 64-bit shifts are slow!
- *
- *
- */
-static int fast_s_mp_sqr(mp_int *a, mp_int *b)
-{
-   int olduse, newused, res, ix, pa;
-   mp_word  W2[512], W[512];
-   
-   REGFUNC("fast_s_mp_sqr");
-   VERIFY(a);
-   VERIFY(b);
-
-   pa = a->used;
-   newused = pa + pa + 1;
-   if (b->alloc < newused) {
-      if ((res = mp_grow(b, newused)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }   
-   
-   /* zero temp buffer (columns) */
-   memset(W, 0, (pa+pa+1)*sizeof(mp_word));
-   memset(W2, 0, (pa+pa+1)*sizeof(mp_word));
-   
-   for (ix = 0; ix < pa; ix++) {
-       /* compute the outer product */
-       W2[ix+ix]   += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
-       
-       {
-          register mp_digit tmpx, *tmpy;
-          register mp_word  *_W;
-          register int      iy;
-          
-          /* copy of left side */
-          tmpx = a->dp[ix];
-          
-          /* alias for right side */
-          tmpy = a->dp + (ix + 1);
-          
-	      _W   = &(W[ix+ix+1]);
-	   
-	      /* inner products */
-          for (iy = ix + 1; iy < pa; iy++) {
-	          *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
-          }
-       }
-   }
-   
-   /* double first value, since the inner products are half of what they should be */
-   W[0] += W[0] + W2[0];
-   
-   /* setup dest */
-   olduse  = b->used;
-   b->used = newused;
-   
-   /* now compute digits */
-   for (ix = 1; ix < newused; ix++) {
-       /* double/add next digit */
-       W[ix]       += W[ix] + W2[ix];
-
-       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
-   }
-   b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
-   
-   /* clear high */
-   for (; ix < olduse; ix++) {
-       b->dp[ix] = 0;
-   }
-   
-   /* fix the sign (since we no longer make a fresh temp) */
-   b->sign = MP_ZPOS;
-   
-   mp_clamp(b);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-static int s_mp_sqr(mp_int *a, mp_int *b)
-{
-   mp_int t;
-   int res, ix, iy, pa;
-   mp_word  r, u;
-   mp_digit tmpx, *tmpt;
-   
-   REGFUNC("s_mp_sqr");
-   VERIFY(a);
-   VERIFY(b);
-   
-   /* can we use the fast multiplier? */  
-   if (((a->used * 2 + 1) < 512) && a->used < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT) - 1))) {
-      res = fast_s_mp_sqr(a,b);
-      DECFUNC();
-      return res;
-   }  
-   
-   pa = a->used;
-   if ((res = mp_init_size(&t, pa + pa + 1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   t.used = pa + pa + 1;
-   
-   for (ix = 0; ix < pa; ix++) {
-       /* first calculate the digit at 2*ix */
-       /* calculate double precision result */   
-       r           = ((mp_word)t.dp[ix+ix]) + ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
-       
-       /* store lower part in result */       
-       t.dp[ix+ix] = (mp_digit)(r & ((mp_word)MP_MASK));
-       
-       /* get the carry */
-	   u           = (r >> ((mp_word)DIGIT_BIT));
-	   
-	   /* left hand side of A[ix] * A[iy] */
-	   tmpx = a->dp[ix];
-	   
-	   /* alias for where to store the results */
-	   tmpt = &(t.dp[ix+ix+1]);
-	   for (iy = ix + 1; iy < pa; iy++) {
-	       /* first calculate the product */
-	       r           = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
-	       
-	       /* now calculate the double precision result, note we use
-	        * addition instead of *2 since its easier to optimize
-	        */
-	       r           = ((mp_word)*tmpt) + r + r + ((mp_word)u);
-	       
-	       /* store lower part */
-           *tmpt++     = (mp_digit)(r & ((mp_word)MP_MASK));
-           
-           /* get carry */
-    	   u           = (r >> ((mp_word)DIGIT_BIT));
-       }
-       r           = ((mp_word)*tmpt) + u;
-       *tmpt       = (mp_digit)(r & ((mp_word)MP_MASK));
-  	   u           = (r >> ((mp_word)DIGIT_BIT));
-       /* propagate upwards */
-       ++tmpt;
-       while (u != ((mp_word)0)) { 
-          r                = ((mp_word)*tmpt) + ((mp_word)1);
-          *tmpt++          = (mp_digit)(r & ((mp_word)MP_MASK));
-          u                = (r >> ((mp_word)DIGIT_BIT));
-       }
-   }
-   
-   mp_clamp(&t);
-   mp_exch(&t, b);
-   mp_clear(&t);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* high level addition (handles signs) */
-int mp_add(mp_int *a, mp_int *b, mp_int *c)
-{
-   int sa, sb, res;
- 
-   REGFUNC("mp_add");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-
-   sa = a->sign;
-   sb = b->sign;
-
-   /* handle four cases */
-   if (sa == MP_ZPOS && sb == MP_ZPOS) {
-      /* both positive */
-      res = s_mp_add(a, b, c);
-      c->sign = MP_ZPOS;
-   } else if (sa == MP_ZPOS && sb == MP_NEG) {
-      /* a + -b == a - b, but if b>a then we do it as -(b-a) */
-      if (mp_cmp_mag(a, b) == MP_LT) {
-         res = s_mp_sub(b, a, c);
-         c->sign = MP_NEG;
-      } else {
-         res = s_mp_sub(a, b, c);
-         c->sign = MP_ZPOS;
-      }
-   } else if (sa == MP_NEG && sb == MP_ZPOS) {
-      /* -a + b == b - a, but if a>b then we do it as -(a-b) */
-      if (mp_cmp_mag(a, b) == MP_GT) {
-         res = s_mp_sub(a, b, c);
-         c->sign = MP_NEG;
-      } else {
-         res = s_mp_sub(b, a, c);
-         c->sign = MP_ZPOS;
-      }
-   } else {
-      /* -a + -b == -(a + b) */
-      res = s_mp_add(a, b, c);
-      c->sign = MP_NEG;
-   }
-   DECFUNC();
-   return res;
-}
-
-/* high level subtraction (handles signs) */
-int mp_sub(mp_int *a, mp_int *b, mp_int *c)
-{
-   int sa, sb, res;
-   
-   REGFUNC("mp_sub");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   sa = a->sign;
-   sb = b->sign;
-
-   /* handle four cases */
-   if (sa == MP_ZPOS && sb == MP_ZPOS) {
-      /* both positive, a - b, but if b>a then we do -(b - a) */
-      if (mp_cmp_mag(a, b) == MP_LT) {
-         /* b>a */
-         res = s_mp_sub(b, a, c); 
-         c->sign = MP_NEG;
-      } else {
-         res = s_mp_sub(a, b, c);
-         c->sign = MP_ZPOS;
-      }
-   } else if (sa == MP_ZPOS && sb == MP_NEG) {
-      /* a - -b == a + b  */
-      res = s_mp_add(a, b, c);
-      c->sign = MP_ZPOS;
-   } else if (sa == MP_NEG && sb == MP_ZPOS) {
-      /* -a - b == -(a + b) */
-      res = s_mp_add(a, b, c);
-      c->sign = MP_NEG;
-   } else {
-      /* -a - -b == b - a, but if a>b == -(a - b) */
-      if (mp_cmp_mag(a, b) == MP_GT) {
-         res = s_mp_sub(a, b, c);
-         c->sign = MP_NEG;
-      } else {
-         res = s_mp_sub(b, a, c);
-         c->sign = MP_ZPOS;
-      }
-   }
-
-   DECFUNC();
-   return res;
-}
-
-/* c = |a| * |b| using Karatsuba Multiplication */
-static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
-{
-   mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1;
-   int B, err, x;
-
-   REGFUNC("mp_karatsuba_mul");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-
-   err = MP_MEM;
-
-   /* min # of digits */
-   B = MIN(a->used, b->used);
- 
-   /* now divide in two */
-   B = B/2;
-
-   /* init copy all the temps */
-   if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
-   if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
-   if (mp_init_size(&y0, B) != MP_OKAY) goto X1;
-   if (mp_init_size(&y1, b->used - B) != MP_OKAY) goto Y0;
-
-   /* init temps */
-   if (mp_init(&t1) != MP_OKAY)         goto Y1;
-   if (mp_init(&t2) != MP_OKAY)         goto T1;
-   if (mp_init(&x0y0) != MP_OKAY)       goto T2;
-   if (mp_init(&x1y1) != MP_OKAY)       goto X0Y0;
-
-   /* now shift the digits */
-   x0.sign = x1.sign = a->sign;
-   y0.sign = y1.sign = b->sign;
-   
-   x0.used = y0.used = B;
-   x1.used = a->used - B;
-   y1.used = b->used - B;
-   
-   for (x = 0; x < B; x++) {
-      x0.dp[x] = a->dp[x];
-      y0.dp[x] = b->dp[x];
-   }
-   for (x = B; x < a->used; x++) {
-      x1.dp[x-B] = a->dp[x];
-   }
-   for (x = B; x < b->used; x++) {
-      y1.dp[x-B] = b->dp[x];
-   }
-   
-   mp_clamp(&x0);
-   mp_clamp(&y0);
-   
-   /* now calc the products x0y0 and x1y1 */
-   if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;             /* x0y0 = x0*y0 */
-   if (mp_mul(&x1, &y1, &x1y1) != MP_OKAY) goto X1Y1;             /* x1y1 = x1*y1 */
-
-   /* now calc x1-x0 and y1-y0 */
-   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1;               /* t1 = x1 - x0 */
-   if (mp_sub(&y1, &y0, &t2) != MP_OKAY) goto X1Y1;               /* t2 = y1 - y0 */
-   if (mp_mul(&t1, &t2, &t1) != MP_OKAY) goto X1Y1;               /* t1 = (x1 - x0) * (y1 - y0) */
-
-   /* add x0y0 */
-   if (mp_add(&x0y0, &x1y1, &t2) != MP_OKAY) goto X1Y1;           /* t2 = x0y0 + x1y1 */
-   if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1Y1;               /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-
-   /* shift by B */
-   if (mp_lshd(&t1, B) != MP_OKAY) goto X1Y1;                     /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-   if (mp_lshd(&x1y1, B*2) != MP_OKAY) goto X1Y1;                 /* x1y1 = x1y1 << 2*B */
-
-   if (mp_add(&x0y0, &t1, &t1) != MP_OKAY) goto X1Y1;             /* t1 = x0y0 + t1 */
-   if (mp_add(&t1, &x1y1, c) != MP_OKAY) goto X1Y1;               /* t1 = x0y0 + t1 + x1y1 */
-
-   err = MP_OKAY;
-
-X1Y1: mp_clear(&x1y1);
-X0Y0: mp_clear(&x0y0);
-T2  : mp_clear(&t2);
-T1  : mp_clear(&t1);
-Y1  : mp_clear(&y1);
-Y0  : mp_clear(&y0);
-X1  : mp_clear(&x1);
-X0  : mp_clear(&x0);
-ERR :
-    DECFUNC();
-    return err;
-}
-
-/* high level multiplication (handles sign) */
-int mp_mul(mp_int *a, mp_int *b, mp_int *c)
-{
-   int res, neg;
-   REGFUNC("mp_mul");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-   if (MIN(a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
-      res = mp_karatsuba_mul(a, b, c);
-   } else {
-      res = s_mp_mul(a, b, c);
-   }
-   c->sign = neg;
-   DECFUNC();
-   return res;
-}
-
-/* Karatsuba squaring, computes b = a*a */
-static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
-{
-   mp_int x0, x1, t1, t2, x0x0, x1x1;
-   int B, err, x;
-   
-   REGFUNC("mp_karatsuba_sqr");
-   VERIFY(a);
-   VERIFY(b);
-
-   err = MP_MEM;
-
-   /* min # of digits */
-   B = a->used;
-
-   /* now divide in two */
-   B = B/2;
-
-   /* init copy all the temps */
-   if (mp_init_size(&x0, B) != MP_OKAY) goto ERR;
-   if (mp_init_size(&x1, a->used - B) != MP_OKAY) goto X0;
-
-   /* init temps */
-   if (mp_init(&t1) != MP_OKAY)         goto X1;
-   if (mp_init(&t2) != MP_OKAY)         goto T1;
-   if (mp_init(&x0x0) != MP_OKAY)       goto T2;
-   if (mp_init(&x1x1) != MP_OKAY)       goto X0X0;
-
-   /* now shift the digits */
-   for (x = 0; x < B; x++) {
-       x0.dp[x] = a->dp[x];
-   }
-
-   for (x = B; x < a->used; x++) {
-       x1.dp[x-B] = a->dp[x];
-   }
-   
-   x0.used = B;
-   x1.used = a->used - B;
-   
-   mp_clamp(&x0);
-   
-   /* now calc the products x0*x0 and x1*x1 */
-   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                  /* x0x0 = x0*x0 */
-   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                  /* x1x1 = x1*x1 */
-
-   /* now calc x1-x0 and y1-y0 */
-   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */
-   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                    /* t1 = (x1 - x0) * (y1 - y0) */
-
-   /* add x0y0 */
-   if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */
-   if (mp_sub(&t2, &t1, &t1) != MP_OKAY) goto X1X1;               /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-
-   /* shift by B */
-   if (mp_lshd(&t1, B) != MP_OKAY) goto X1X1;                     /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-   if (mp_lshd(&x1x1, B*2) != MP_OKAY) goto X1X1;                 /* x1y1 = x1y1 << 2*B */
-
-   if (mp_add(&x0x0, &t1, &t1) != MP_OKAY) goto X1X1;             /* t1 = x0y0 + t1 */
-   if (mp_add(&t1, &x1x1, b) != MP_OKAY) goto X1X1;               /* t1 = x0y0 + t1 + x1y1 */
-
-   err = MP_OKAY;
-   
-X1X1: mp_clear(&x1x1);
-X0X0: mp_clear(&x0x0);
-T2  : mp_clear(&t2);
-T1  : mp_clear(&t1);
-X1  : mp_clear(&x1);
-X0  : mp_clear(&x0);
-ERR :
-    DECFUNC();
-    return err;
-}
-
-/* computes b = a*a */
-int mp_sqr(mp_int *a, mp_int *b)
-{
-   int res;
-   REGFUNC("mp_sqr");
-   VERIFY(a);
-   VERIFY(b);
-   if (a->used > KARATSUBA_SQR_CUTOFF) {
-      res = mp_karatsuba_sqr(a, b);
-   } else {
-      res = s_mp_sqr(a, b);
-   }
-   b->sign = MP_ZPOS;
-   DECFUNC();
-   return res;
-}
-
-
-/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] 
- * HAC pp.598 Algorithm 14.20
- *
- * Note that the description in HAC is horribly incomplete.  For example, 
- * it doesn't consider the case where digits are removed from 'x' in the inner
- * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
- *
- * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
-*/
-int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
-{
-   mp_int q, x, y, t1, t2;
-   int res, n, t, i, norm, neg;
-   
-   REGFUNC("mp_div");
-   VERIFY(a);
-   VERIFY(b);
-   if (c != NULL) { VERIFY(c); }
-   if (d != NULL) { VERIFY(d); }
-   
-   /* is divisor zero ? */
-   if (mp_iszero(b) == 1) {
-      DECFUNC();
-      return MP_VAL;
-   }
-   
-   /* if a < b then q=0, r = a */
-   if (mp_cmp_mag(a, b) == MP_LT) {
-      if (d != NULL) {
-           res = mp_copy(a, d);
-      } else {
-           res = MP_OKAY;
-      }
-      if (c != NULL) {
-         mp_zero(c);
-      }
-      DECFUNC();
-      return res;
-   }
- 
-   if ((res = mp_init_size(&q, a->used + 2)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   q.used = a->used + 2;
-   
-   if ((res = mp_init(&t1)) != MP_OKAY) {
-      goto __Q;
-   }
-   
-   if ((res = mp_init(&t2)) != MP_OKAY) {
-      goto __T1;
-   }
-   
-   if ((res = mp_init_copy(&x, a)) != MP_OKAY) {
-      goto __T2;
-   }
-
-   if ((res = mp_init_copy(&y, b)) != MP_OKAY) {
-      goto __X;
-   }
-   
-   /* fix the sign */
-   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-   x.sign = y.sign = MP_ZPOS;
-   
-   /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
-   norm = 0;
-   while ((y.dp[y.used-1] & (((mp_digit)1)<<(DIGIT_BIT-1))) == ((mp_digit)0)) {
-      ++norm;
-      if ((res = mp_mul_2d(&x, 1, &x)) != MP_OKAY) {
-         goto __Y;
-      }
-      if ((res = mp_mul_2d(&y, 1, &y)) != MP_OKAY) {
-         goto __Y;
-      }
-   }
-   
-   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
-   n = x.used - 1;
-   t = y.used - 1;
-
-   /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
-   if ((res = mp_lshd(&y, n - t)) != MP_OKAY) {                            /* y = y*b^{n-t} */
-      goto __Y;
-   }
- 
-   while (mp_cmp(&x, &y) != MP_LT) {
-       ++(q.dp[n - t]);
-       if ((res = mp_sub(&x, &y, &x)) != MP_OKAY) {
-          goto __Y;
-       }
-   }
-   
-   /* reset y by shifting it back down */
-   mp_rshd(&y, n - t);
-   
-   /* step 3. for i from n down to (t + 1) */
-   for (i = n; i >= (t + 1); i--) {
-       if (i > x.alloc) continue;
-       
-       /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
-       if (x.dp[i] == y.dp[t]) {
-          q.dp[i - t - 1] = ((1UL<<DIGIT_BIT)-1UL);
-       } else {
-          mp_word tmp;
-          tmp  = ((mp_word)x.dp[i]) << ((mp_word)DIGIT_BIT);
-          tmp |= ((mp_word)x.dp[i-1]);
-          tmp /= ((mp_word)y.dp[t]);
-          if (tmp > (mp_word)MP_MASK) tmp = MP_MASK;
-          q.dp[i - t - 1] = (mp_digit)(tmp & (mp_word)(MP_MASK));
-       }
-       
-       /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
-       q.dp[i-t-1] = (q.dp[i-t-1] + 1) & MP_MASK;
-       do {
-          q.dp[i-t-1] = (q.dp[i-t-1] - 1) & MP_MASK;
-          
-          /* find left hand */
-          mp_zero(&t1);
-          t1.dp[0] = (t-1 < 0) ? 0 : y.dp[t-1];
-          t1.dp[1] = y.dp[t];
-          t1.used = 2;
-          if ((res = mp_mul_d(&t1, q.dp[i-t-1], &t1)) != MP_OKAY) {
-             goto __Y;
-          }
-          
-          /* find right hand */
-          t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i-2];
-          t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i-1];
-          t2.dp[2] = x.dp[i];
-          t2.used = 3;
-       } while (mp_cmp(&t1, &t2) == MP_GT);
-        
-       /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
-       if ((res = mp_mul_d(&y, q.dp[i-t-1], &t1)) != MP_OKAY) {
-          goto __Y;
-       }
-       
-       if ((res = mp_lshd(&t1, i - t - 1)) != MP_OKAY) {
-          goto __Y;
-       }
-       
-       if ((res = mp_sub(&x, &t1, &x)) != MP_OKAY) { 
-          goto __Y;
-       }
-       
-       /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
-       if (x.sign == MP_NEG) {
-          if ((res = mp_copy(&y, &t1)) != MP_OKAY) {
-             goto __Y;
-          }
-          if ((res = mp_lshd(&t1, i-t-1)) != MP_OKAY) {
-             goto __Y;
-          }
-          if ((res = mp_add(&x, &t1, &x)) != MP_OKAY) {
-             goto __Y;
-          }
-          
-          q.dp[i-t-1] = (q.dp[i-t-1] - 1UL) & MP_MASK;
-       }
-    }
-    
-    /* now q is the quotient and x is the remainder [which we have to normalize] */
-    /* get sign before writing to c */
-    x.sign = a->sign;
-    if (c != NULL) {
-       mp_clamp(&q);
-       mp_exch(&q, c);
-       c->sign = neg;
-    }
-    
-    if (d != NULL) {
-       mp_div_2d(&x, norm, &x, NULL);
-       mp_clamp(&x);
-       mp_exch(&x, d);
-    }
-        
-   res = MP_OKAY;
-
-__Y:   mp_clear(&y);
-__X:   mp_clear(&x);
-__T2:  mp_clear(&t2);
-__T1:  mp_clear(&t1);
-__Q:   mp_clear(&q);
-   DECFUNC();
-   return res;   
-}
-
-/* c = a mod b, 0 <= c < b */
-int mp_mod(mp_int *a, mp_int *b, mp_int *c)
-{
-   mp_int t;
-   int res;
-   
-   REGFUNC("mp_mod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-
-   if ((res = mp_div(a, b, NULL, &t)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   
-   if (t.sign == MP_NEG) {
-      res = mp_add(b, &t, c);
-   } else {
-      res = MP_OKAY;
-      mp_exch(&t, c);
-   }
-
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}
-
-/* single digit addition */
-int mp_add_d(mp_int *a, mp_digit b, mp_int *c)
-{
-   mp_int t;
-   int res;
-   
-   REGFUNC("mp_add_d");
-   VERIFY(a);
-   VERIFY(c);
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   mp_set(&t, b);
-   res = mp_add(a, &t, c);
-
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}   
-   
-/* single digit subtraction */
-int mp_sub_d(mp_int *a, mp_digit b, mp_int *c)
-{
-   mp_int t;
-   int res;
-   
-   REGFUNC("mp_sub_d");
-   VERIFY(a);
-   VERIFY(c);
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   mp_set(&t, b);
-   res = mp_sub(a, &t, c);
-
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}
-
-/* multiply by a digit */
-int mp_mul_d(mp_int *a, mp_digit b, mp_int *c)
-{
-   int res, pa, ix;
-   mp_word  r;
-   mp_digit u;
-   mp_int   t;
-   
-   REGFUNC("mp_mul_d");
-   VERIFY(a);
-   VERIFY(c);
-   
-   pa = a->used;
-   if ((res = mp_init_size(&t, pa + 2)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   t.used = pa + 2;
-   
-   u = 0;
-   for (ix = 0; ix < pa; ix++) {
-       r = ((mp_word)u) + ((mp_word)a->dp[ix]) * ((mp_word)b);
-       t.dp[ix] = (mp_digit)(r & ((mp_word)MP_MASK));
-	   u        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
-   }
-   t.dp[ix] = u;
-   
-   t.sign = a->sign;
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* single digit division */
-int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
-{
-   mp_int t, t2;
-   int res;
-   
-   REGFUNC("mp_div_d");
-   VERIFY(a);
-   if (c != NULL) { VERIFY(c); }
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-
-   if ((res = mp_init(&t2)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-
-   mp_set(&t, b);
-   res = mp_div(a, &t, c, &t2);
-
-   if (d != NULL) {
-      *d = t2.dp[0];
-   }
-   
-   mp_clear(&t);
-   mp_clear(&t2);
-   DECFUNC();
-   return res;
-}
-
-int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c)
-{
-   mp_int t, t2;
-   int res;
-   
-   REGFUNC("mp_mod_d");
-   VERIFY(a);
-      
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_init(&t2)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   
-   mp_set(&t, b);
-   mp_div(a, &t, NULL, &t2);
-   
-   if (t2.sign == MP_NEG) {
-      if ((res = mp_add_d(&t2, b, &t2)) != MP_OKAY) {
-         mp_clear(&t);
-         mp_clear(&t2);
-         DECFUNC();
-         return res;
-      }
-   }
-   *c = t2.dp[0];
-   mp_clear(&t);
-   mp_clear(&t2);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-int mp_expt_d(mp_int *a, mp_digit b, mp_int *c)
-{
-   int res, x;
-   mp_int g;
-   
-   REGFUNC("mp_expt_d");
-   VERIFY(a);
-   VERIFY(c);
-   
-   if ((res = mp_init_copy(&g, a)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   /* set initial result */
-   mp_set(c, 1);
-   
-   for (x = 0; x < (int)DIGIT_BIT; x++) {
-       if ((res = mp_sqr(c, c)) != MP_OKAY) {
-          mp_clear(&g);
-          DECFUNC();
-          return res;
-       }
-       
-       if ((b & (mp_digit)(1<<(DIGIT_BIT-1))) != 0) {
-          if ((res = mp_mul(c, &g, c)) != MP_OKAY) {
-             mp_clear(&g);
-             DECFUNC();
-             return res;
-          }
-       }
-       
-       b <<= 1;
-   }
-   
-   mp_clear(&g);
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* simple modular functions */
-
-/* d = a + b (mod c) */
-int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
-{
-   int res;
-   mp_int t;
-   
-   REGFUNC("mp_addmod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   VERIFY(d);
-   
-   if ((res = mp_init(&t)) != MP_OKAY) { 
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_add(a, b, &t)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   res = mp_mod(&t, c, d);
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}
-
-/* d = a - b (mod c) */
-int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
-{
-   int res;
-   mp_int t;
-   
-   REGFUNC("mp_submod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   VERIFY(d);
-      
-   if ((res = mp_init(&t)) != MP_OKAY) { 
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_sub(a, b, &t)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   res = mp_mod(&t, c, d);
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}
-
-/* d = a * b (mod c) */
-int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
-{
-   int res;
-   mp_int t;
-   
-   REGFUNC("mp_mulmod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   VERIFY(d);
-   
-   if ((res = mp_init(&t)) != MP_OKAY) { 
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_mul(a, b, &t)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   res = mp_mod(&t, c, d);
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}
-
-/* c = a * a (mod b) */
-int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c)
-{
-   int res;
-   mp_int t;
-   
-   REGFUNC("mp_sqrmod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-
-   if ((res = mp_init(&t)) != MP_OKAY) { 
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_sqr(a, &t)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   res = mp_mod(&t, b, c);
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}
-
-/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP] 
- */
-int mp_gcd(mp_int *a, mp_int *b, mp_int *c)
-{
-   mp_int u, v, t;
-   int k, res, neg;
-   
-   REGFUNC("mp_gcd");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   /* either zero than gcd is the largest */
-   if (mp_iszero(a) == 1 && mp_iszero(b) == 0) {
-      DECFUNC();
-      return mp_copy(b, c);
-   }
-   if (mp_iszero(a) == 0 && mp_iszero(b) == 1) {
-      DECFUNC();
-      return mp_copy(a, c);
-   }
-   if (mp_iszero(a) == 1 && mp_iszero(b) == 1) {
-      mp_set(c, 1);
-      DECFUNC();
-      return MP_OKAY;
-   }
-   
-   /* if both are negative they share (-1) as a common divisor */
-   neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
-   
-   if ((res = mp_init_copy(&u, a)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_init_copy(&v, b)) != MP_OKAY) {
-      goto __U;
-   }
-   
-   /* must be positive for the remainder of the algorithm */
-   u.sign = v.sign = MP_ZPOS;
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      goto __V;
-   }
-   
-   /* B1.  Find power of two */
-   k = 0;
-   while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
-       ++k;
-       if ((res = mp_div_2d(&u, 1, &u, NULL)) != MP_OKAY) {
-          goto __T;
-       }
-       if ((res = mp_div_2d(&v, 1, &v, NULL)) != MP_OKAY) {
-          goto __T;
-       }
-   }
-   
-   /* B2.  Initialize */
-   if ((u.dp[0] & 1) == 1) {
-      if ((res = mp_copy(&v, &t)) != MP_OKAY) {
-         goto __T;
-      }
-      t.sign = MP_NEG;
-   } else {
-      if ((res = mp_copy(&u, &t)) != MP_OKAY) {
-         goto __T;
-      }
-   }
-   
-   do {
-      /* B3 (and B4).  Halve t, if even */
-      while (t.used != 0 && (t.dp[0] & 1) == 0) {
-          if ((res = mp_div_2d(&t, 1, &t, NULL)) != MP_OKAY) {
-             goto __T;
-          }
-      }
-      
-      /* B5.  if t>0 then u=t otherwise v=-t */
-      if (t.used != 0 && t.sign != MP_NEG) {
-         if ((res = mp_copy(&t, &u)) != MP_OKAY) {
-            goto __T;
-         }
-      } else {
-         if ((res = mp_copy(&t, &v)) != MP_OKAY) {
-            goto __T;
-         }
-         v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-      }
-      
-      /* B6.  t = u - v, if t != 0 loop otherwise terminate */
-      if ((res = mp_sub(&u, &v, &t)) != MP_OKAY) {
-         goto __T;
-      }
-   } while (t.used != 0);
-   
-   if ((res = mp_mul_2d(&u, k, &u)) != MP_OKAY) {
-      goto __T;
-   }
-   
-   mp_exch(&u, c);
-   c->sign = neg;
-   res = MP_OKAY;
-__T:   mp_clear(&t);
-__V:   mp_clear(&u);
-__U:   mp_clear(&v);
-   DECFUNC();
-   return res;
-}
-         
-/* computes least common multipble as a*b/(a, b) */
-int mp_lcm(mp_int *a, mp_int *b, mp_int *c)
-{
-   int res;
-   mp_int t;
-   
-   REGFUNC("mp_lcm");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_mul(a, b, &t)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   
-   if ((res = mp_gcd(a, b, c)) != MP_OKAY) {
-      mp_clear(&t);
-      DECFUNC();
-      return res;
-   }
-   
-   res = mp_div(&t, c, c, NULL);
-   mp_clear(&t);
-   DECFUNC();
-   return res;
-}   
-
-/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
-static int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c)
-{
-   mp_int x, y, u, v, B, D;
-   int res, neg;
-   
-   REGFUNC("fast_mp_invmod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   if ((res = mp_init(&x)) != MP_OKAY) {
-      goto __ERR;
-   }
-   
-   if ((res = mp_init(&y)) != MP_OKAY) {
-      goto __X;
-   }
-   
-   if ((res = mp_init(&u)) != MP_OKAY) {
-      goto __Y;
-   }
-   
-   if ((res = mp_init(&v)) != MP_OKAY) {
-      goto __U;
-   }
-   
-   if ((res = mp_init(&B)) != MP_OKAY) {
-      goto __V;
-   }
- 
-   if ((res = mp_init(&D)) != MP_OKAY) {
-      goto __B;
-   }
-   
-   /* x == modulus, y == value to invert */
-   if ((res = mp_copy(b, &x)) != MP_OKAY) {
-      goto __D;
-   }
-   if ((res = mp_copy(a, &y)) != MP_OKAY) {
-      goto __D;
-   }
-   
-   if ((res = mp_abs(&y, &y)) != MP_OKAY) {
-      goto __D;
-   }
-   
-   /* 2. [modified] if x,y are both even then return an error! */
-   if (mp_iseven(&x) == 1 && mp_iseven(&y) == 1) {
-      res = MP_VAL;
-      goto __D;
-   }
-   
-   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-   if ((res = mp_copy(&x, &u)) != MP_OKAY) {
-      goto __D;
-   }
-   if ((res = mp_copy(&y, &v)) != MP_OKAY) {
-      goto __D;
-   }
-   mp_set(&D, 1);
-   
-
-top:   
-   /* 4.  while u is even do */
-   while (mp_iseven(&u) == 1) {
-      /* 4.1 u = u/2 */
-      if ((res = mp_div_2(&u, &u)) != MP_OKAY) {
-         goto __D;
-      }
-      /* 4.2 if A or B is odd then */
-      if (mp_iseven(&B) == 0) {
-         if ((res = mp_sub(&B, &x, &B)) != MP_OKAY) {
-            goto __D;
-         }
-      }
-      /* A = A/2, B = B/2 */
-	  if ((res = mp_div_2(&B, &B)) != MP_OKAY) {
-	     goto __D;
-	  }
-   }
-   
-  
-   /* 5.  while v is even do */
-   while (mp_iseven(&v) == 1) {
-      /* 5.1 v = v/2 */
-      if ((res = mp_div_2(&v, &v)) != MP_OKAY) {
-         goto __D;
-      }
-      /* 5.2 if C,D are even then */
-      if (mp_iseven(&D) == 0) {
-         /* C = (C+y)/2, D = (D-x)/2 */
-         if ((res = mp_sub(&D, &x, &D)) != MP_OKAY) {
-            goto __D;
-         }
-      }
-      /* C = C/2, D = D/2 */
-	  if ((res = mp_div_2(&D, &D)) != MP_OKAY) {
-	     goto __D;
-	  }
-   }
-   
-   /* 6.  if u >= v then */
-   if (mp_cmp(&u, &v) != MP_LT) {
-      /* u = u - v, A = A - C, B = B - D */
-      if ((res = mp_sub(&u, &v, &u)) != MP_OKAY) {
-         goto __D;
-      }
-   
-      if ((res = mp_sub(&B, &D, &B)) != MP_OKAY) {
-         goto __D;
-      }
-   } else {
-      /* v - v - u, C = C - A, D = D - B */
-      if ((res = mp_sub(&v, &u, &v)) != MP_OKAY) {
-         goto __D;
-      }
-   
-      if ((res = mp_sub(&D, &B, &D)) != MP_OKAY) {
-         goto __D;
-      }
-   }
-   
-   /* if not zero goto step 4 */
-   if (mp_iszero(&u) == 0) goto top;
-   
-   /* now a = C, b = D, gcd == g*v */
- 
-   /* if v != 1 then there is no inverse */
-   if (mp_cmp_d(&v, 1) != MP_EQ) {
-      res = MP_VAL;
-      goto __D;
-   }
-   
-   /* b is now the inverse */
-   neg = a->sign;
-   while (D.sign == MP_NEG) {
-      if ((res = mp_add(&D, b, &D)) != MP_OKAY) {
-         goto __D;
-      }
-   }
-   mp_exch(&D, c);
-   c->sign = neg;
-   res = MP_OKAY;
-   
-__D:   mp_clear(&D);
-__B:   mp_clear(&B);
-__V:   mp_clear(&v);
-__U:   mp_clear(&u);
-__Y:   mp_clear(&y);
-__X:   mp_clear(&x);
-__ERR:
-   DECFUNC();
-   return res;
-}
-
-int mp_invmod(mp_int *a, mp_int *b, mp_int *c)
-{
-   mp_int x, y, u, v, A, B, C, D;
-   int res;
-   
-   REGFUNC("mp_invmod");
-   VERIFY(a);
-   VERIFY(b);
-   VERIFY(c);
-   
-   /* b cannot be negative */
-   if (b->sign == MP_NEG) {
-      return MP_VAL;
-   }
-   
-   /* if the modulus is odd we can use a faster routine instead */
-   if (mp_iseven(b) == 0) {
-      res = fast_mp_invmod(a,b,c);
-      DECFUNC();
-      return res;
-   }
-
-   if ((res = mp_init(&x)) != MP_OKAY) {
-      goto __ERR;
-   }
-   
-   if ((res = mp_init(&y)) != MP_OKAY) {
-      goto __X;
-   }
-   
-   if ((res = mp_init(&u)) != MP_OKAY) {
-      goto __Y;
-   }
-   
-   if ((res = mp_init(&v)) != MP_OKAY) {
-      goto __U;
-   }
-   
-   if ((res = mp_init(&A)) != MP_OKAY) {
-      goto __V;
-   }
-   
-   if ((res = mp_init(&B)) != MP_OKAY) {
-      goto __A;
-   }
-   
-   if ((res = mp_init(&C)) != MP_OKAY) {
-      goto __B;
-   }
-   
-   if ((res = mp_init(&D)) != MP_OKAY) {
-      goto __C;
-   }
-   
-   /* x = a, y = b */
-   if ((res = mp_copy(a, &x)) != MP_OKAY) {
-      goto __D;
-   }
-   if ((res = mp_copy(b, &y)) != MP_OKAY) {
-      goto __D;
-   }
-   
-   if ((res = mp_abs(&x, &x)) != MP_OKAY) {
-      goto __D;
-   }
-   
-   /* 2. [modified] if x,y are both even then return an error! */
-   if (mp_iseven(&x) == 1 && mp_iseven(&y) == 1) {
-      res = MP_VAL;
-      goto __D;
-   }
-   
-   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-   if ((res = mp_copy(&x, &u)) != MP_OKAY) {
-      goto __D;
-   }
-   if ((res = mp_copy(&y, &v)) != MP_OKAY) {
-      goto __D;
-   }
-   mp_set(&A, 1);
-   mp_set(&D, 1);
-   
-
-top:   
-   /* 4.  while u is even do */
-   while (mp_iseven(&u) == 1) {
-      /* 4.1 u = u/2 */
-      if ((res = mp_div_2(&u, &u)) != MP_OKAY) {
-         goto __D;
-      }
-      /* 4.2 if A or B is odd then */
-      if (mp_iseven(&A) == 0 || mp_iseven(&B) == 0) {
-         /* A = (A+y)/2, B = (B-x)/2 */
-         if ((res = mp_add(&A, &y, &A)) != MP_OKAY) {
-            goto __D;
-         }
-         if ((res = mp_sub(&B, &x, &B)) != MP_OKAY) {
-            goto __D;
-         }
-      }
-      /* A = A/2, B = B/2 */
-	  if ((res = mp_div_2(&A, &A)) != MP_OKAY) {
-	     goto __D;
-	  }
-	  if ((res = mp_div_2(&B, &B)) != MP_OKAY) {
-	     goto __D;
-	  }
-   }
-   
-  
-   /* 5.  while v is even do */
-   while (mp_iseven(&v) == 1) {
-      /* 5.1 v = v/2 */
-      if ((res = mp_div_2(&v, &v)) != MP_OKAY) {
-         goto __D;
-      }
-      /* 5.2 if C,D are even then */
-      if (mp_iseven(&C) == 0 || mp_iseven(&D) == 0) {
-         /* C = (C+y)/2, D = (D-x)/2 */
-         if ((res = mp_add(&C, &y, &C)) != MP_OKAY) {
-            goto __D;
-         }
-         if ((res = mp_sub(&D, &x, &D)) != MP_OKAY) {
-            goto __D;
-         }
-      }
-      /* C = C/2, D = D/2 */
-	  if ((res = mp_div_2(&C, &C)) != MP_OKAY) {
-	     goto __D;
-	  }
-	  if ((res = mp_div_2(&D, &D)) != MP_OKAY) {
-	     goto __D;
-	  }
-   }
-   
-   /* 6.  if u >= v then */
-   if (mp_cmp(&u, &v) != MP_LT) {
-      /* u = u - v, A = A - C, B = B - D */
-      if ((res = mp_sub(&u, &v, &u)) != MP_OKAY) {
-         goto __D;
-      }
-   
-      if ((res = mp_sub(&A, &C, &A)) != MP_OKAY) {
-         goto __D;
-      }
-   
-      if ((res = mp_sub(&B, &D, &B)) != MP_OKAY) {
-         goto __D;
-      }
-   } else {
-      /* v - v - u, C = C - A, D = D - B */
-      if ((res = mp_sub(&v, &u, &v)) != MP_OKAY) {
-         goto __D;
-      }
-   
-      if ((res = mp_sub(&C, &A, &C)) != MP_OKAY) {
-         goto __D;
-      }
-   
-      if ((res = mp_sub(&D, &B, &D)) != MP_OKAY) {
-         goto __D;
-      }
-   }
-   
-   /* if not zero goto step 4 */
-   if (mp_iszero(&u) == 0) goto top;
-   
-   /* now a = C, b = D, gcd == g*v */
- 
-   /* if v != 1 then there is no inverse */
-   if (mp_cmp_d(&v, 1) != MP_EQ) {
-      res = MP_VAL;
-      goto __D;
-   }
-   
-   /* a is now the inverse */
-   mp_exch(&C, c);
-   res = MP_OKAY;
-   
-__D:   mp_clear(&D);
-__C:   mp_clear(&C);
-__B:   mp_clear(&B);
-__A:   mp_clear(&A);
-__V:   mp_clear(&v);
-__U:   mp_clear(&u);
-__Y:   mp_clear(&y);
-__X:   mp_clear(&x);
-__ERR:
-   DECFUNC();
-   return res;
-}
-
-/* pre-calculate the value required for Barrett reduction 
- * For a given modulus "b" it calulates the value required in "a"
- */
-int mp_reduce_setup(mp_int *a, mp_int *b)
-{
-   int res;
-   
-   REGFUNC("mp_reduce_setup");
-   VERIFY(a);
-   VERIFY(b);
-   
-   if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
-      DECFUNC();
-      return res;
-   }
-   res = mp_div(a, b, a, NULL);
-   DECFUNC();
-   return res;   
-}
-
-/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup 
- * From HAC pp.604 Algorithm 14.42 
- */
-int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
-{
-  mp_int   q;
-  int      res, um = m->used;
-  
-  REGFUNC("mp_reduce");
-  VERIFY(x);
-  VERIFY(m);
-  VERIFY(mu);
-  
-  if((res = mp_init_copy(&q, x)) != MP_OKAY) {
-    DECFUNC();
-    return res;
-  }
-
-  mp_rshd(&q, um - 1);       /* q1 = x / b^(k-1)  */
-  
-  /* according to HAC this is optimization is ok */
-  if (((unsigned long)m->used) > (1UL<<(unsigned long)(DIGIT_BIT-1UL))) {
-     if ((res = mp_mul(&q, mu, &q)) != MP_OKAY) {
-        goto CLEANUP;
-     }
-  } else {
-     if ((res = s_mp_mul_high_digs(&q, mu, &q, um-1)) != MP_OKAY) {
-        goto CLEANUP;
-     }
-  }
-
-  mp_rshd(&q, um + 1);       /* q3 = q2 / b^(k+1) */
-
-  /* x = x mod b^(k+1), quick (no division) */
-  if ((res = mp_mod_2d(x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
-     goto CLEANUP;
-  }
-
-  /* q = q * m mod b^(k+1), quick (no division) */
-  if ((res = s_mp_mul_digs(&q, m, &q, um + 1)) != MP_OKAY) {
-     goto CLEANUP;
-  }
-
-  /* x = x - q */
-  if((res = mp_sub(x, &q, x)) != MP_OKAY)
-    goto CLEANUP;
-
-  /* If x < 0, add b^(k+1) to it */
-  if(mp_cmp_d(x, 0) == MP_LT) {
-    mp_set(&q, 1);
-    if((res = mp_lshd(&q, um + 1)) != MP_OKAY)
-      goto CLEANUP;
-    if((res = mp_add(x, &q, x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  /* Back off if it's too big */
-  while(mp_cmp(x, m) != MP_LT) {
-    if((res = s_mp_sub(x, m, x)) != MP_OKAY)
-      break;
-  }
-
- CLEANUP:
-  mp_clear(&q);
-  DECFUNC();
-
-  return res;
-}
-
-/* setups the montgomery reduction stuff */
-int mp_montgomery_setup(mp_int *a, mp_digit *mp)
-{
-   mp_int t, tt;
-   int res;
-   
-   if ((res = mp_init(&t)) != MP_OKAY) {
-      return res;
-   }
-   
-   if ((res = mp_init(&tt)) != MP_OKAY) {
-      goto __T;
-   }
-   
-   /* tt = b */
-   tt.dp[0] = 0;
-   tt.dp[1] = 1;
-   tt.used  = 2;
-
-   /* t = m mod b */
-   t.dp[0] = a->dp[0];
-   t.used  = 1;
-
-   /* t = 1/m mod b */
-   if ((res = mp_invmod(&t, &tt, &t)) != MP_OKAY) {
-      goto __TT;
-   }
-   
-   /* t = -1/m mod b */
-   *mp = ((mp_digit)1 << ((mp_digit)DIGIT_BIT)) - t.dp[0];
-   
-   res = MP_OKAY;
-__TT: mp_clear(&tt);   
-__T:  mp_clear(&t);
-   return res;
-}   
-
-/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
-static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
-{
-   int ix, res, olduse;
-   mp_digit ui;
-   mp_word  W[512];
-   
-   REGFUNC("fast_mp_montgomery_reduce");
-   VERIFY(a);
-   VERIFY(m);
-   
-   /* get old used count */
-   olduse = a->used;
-   
-   /* grow a as required */
-   if (a->alloc < m->used+1) {
-      if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
-   
-   /* copy the digits of a */
-   for (ix = 0; ix < a->used; ix++) {
-       W[ix] = a->dp[ix];
-   }
-   
-   /* zero the high words */
-   for (; ix < m->used * 2 + 1; ix++) {
-       W[ix] = 0;
-   }
-     
-   for (ix = 0; ix < m->used; ix++) {
-       /* ui = ai * m' mod b 
-        *
-        * We avoid a double precision multiplication (which isn't required)
-        * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
-        * the carry cleared (see after the inner loop)
-        */
-       ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
-       
-       /* a = a + ui * m * b^i 
-        *
-        * This is computed in place and on the fly.  The multiplication 
-        * by b^i is handled by offseting which columns the results 
-        * are added to.
-        *
-        * Note the comba method normally doesn't handle carries in the inner loop
-        * In this case we fix the carry from the previous column since the Montgomery
-        * reduction requires digits of the result (so far) [see above] to work.  This is 
-        * handled by fixing up one carry after the inner loop.  The carry fixups are done
-        * in order so after these loops the first m->used words of W[] have the carries
-        * fixed
-        */       
-       { 
-          register int      iy;
-          register mp_digit *tmpx;
-          register mp_word  *_W;
-          
-          /* aliases */
-          tmpx = m->dp;
-          _W   = W + ix;
-          
-          /* inner loop */
-          for (iy = 0; iy < m->used; iy++) {
-              *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++);
-          }
-       }
-
-       /* now fix carry for next digit, W[ix+1] */
-       W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
-   }
-   
-   /* nox fix rest of carries */
-   for (++ix; ix <= m->used * 2 + 1; ix++) {
-       W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-   }
-   
-   /* copy out, A = A/b^n 
-    *
-    * The result is A/b^n but instead of converting from an array of mp_word
-    * to mp_digit than calling mp_rshd we just copy them in the right
-    * order 
-    */
-   for (ix = 0; ix < m->used + 1; ix++) { 
-       a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
-   }
-   
-   /* set the max used */
-   a->used = m->used + 1;
-
-   /* zero oldused digits, if the input a was larger than 
-    * m->used+1 we'll have to clear the digits */  
-   for (; ix < olduse; ix++) {
-       a->dp[ix] = 0;
-   }
-
-   mp_clamp(a);
-   
-   /* if A >= m then A = A - m */
-   if (mp_cmp_mag(a, m) != MP_LT) {
-      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }   
-   
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
-int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
-{
-   int ix, res, digs;
-   mp_digit ui;
-   
-   REGFUNC("mp_montgomery_reduce");
-   VERIFY(a);
-   VERIFY(m);
-   
-   digs = m->used * 2 + 1;
-   if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
-      res = fast_mp_montgomery_reduce(a, m, mp);
-      DECFUNC();
-      return res;
-   }  
-
-   if (a->alloc < m->used*2+1) {
-      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
-   a->used = m->used * 2 + 1;
-      
-   for (ix = 0; ix < m->used; ix++) {
-       /* ui = ai * m' mod b */
-       ui = (a->dp[ix] * mp) & MP_MASK;
-       
-       /* a = a + ui * m * b^i */
-       { 
-          register int      iy;
-          register mp_digit *tmpx, *tmpy, mu;
-          register mp_word  r;
-          
-          /* aliases */
-          tmpx = m->dp;
-          tmpy = a->dp + ix;
-          
-          mu  = 0;
-          for (iy = 0; iy < m->used; iy++) {
-              r             = ((mp_word)ui) * ((mp_word)*tmpx++) + ((mp_word)mu) + ((mp_word)*tmpy);
-              mu            = (r >> ((mp_word)DIGIT_BIT));
-              *tmpy++       = (r & ((mp_word)MP_MASK));
-          }
-          /* propagate carries */
-          while (mu) {
-             *tmpy            += mu;
-             mu                = (*tmpy>>DIGIT_BIT)&1;
-             *tmpy++          &= MP_MASK;
-          }
-       }
-   }
-   
-   /* A = A/b^n */
-   mp_rshd(a, m->used);
-   
-   /* if A >= m then A = A - m */
-   if (mp_cmp_mag(a, m) != MP_LT) {
-      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
-         DECFUNC();
-         return res;
-      }
-   }
-   
-   DECFUNC();
-   return MP_OKAY;
-}
-
-/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85 
- *
- * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
- * The value of k changes based on the size of the exponent.
- *
- * Uses Montgomery reduction 
- */
-static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
-{
-   mp_int M[256], res;
-   mp_digit buf, mp;
-   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-   
-   REGFUNC("mp_exptmod_fast");
-   VERIFY(G);
-   VERIFY(X);
-   VERIFY(P);
-   VERIFY(Y);
-   
-   /* find window size */
-   x = mp_count_bits(X);
-        if (x <= 7)     { winsize = 2; }
-   else if (x <= 36)    { winsize = 3; }
-   else if (x <= 140)   { winsize = 4; }
-   else if (x <= 450)   { winsize = 5; }
-   else if (x <= 1303)  { winsize = 6; }
-   else if (x <= 3529)  { winsize = 7; }
-   else                 { winsize = 8; }
-
-   /* init G array */
-   for (x = 0; x < (1<<winsize); x++) {
-      if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
-         for (y = 0; y < x; y++) {
-            mp_clear(&M[y]);
-         }
-         DECFUNC();
-         return err;
-      }
-   }
-   
-   /* now setup montgomery  */
-   if ((err = mp_montgomery_setup(P, &mp)) != MP_OKAY) {
-      goto __M;
-   }
-   
-   /* setup result */
-   if ((err = mp_init(&res)) != MP_OKAY) {
-      goto __RES;
-   }
-
-   /* now we need R mod m */
-   if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
-      goto __RES;
-   }
-      
-   /* res = R mod m */
-   if ((err = mp_mod(&res, P, &res)) != MP_OKAY) {
-      goto __RES;
-   }   
-   
-   /* create M table 
-    *
-    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
-    *
-    * The first half of the table is not computed though accept for M[0] and M[1]
-    */
-   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-      goto __RES;
-   }
-   
-   /* now set M[1] to G * R mod m */
-   if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) {
-      goto __RES;
-   }
-      
-   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-   if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
-      goto __RES;
-   }
-   
-   for (x = 0; x < (winsize-1); x++) {
-       if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) {
-          goto __RES;
-       }
-       if ((err = mp_montgomery_reduce(&M[1<<(winsize-1)], P, mp)) != MP_OKAY) {
-          goto __RES;
-       }
-   }  
-   
-   /* create upper table */
-   for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) {
-       if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
-          goto __RES;
-       }
-       if ((err = mp_montgomery_reduce(&M[x], P, mp)) != MP_OKAY) {
-          goto __RES;
-       }
-   }
-
-   /* set initial mode and bit cnt */
-   mode   = 0;
-   bitcnt = 0;
-   buf    = 0;
-   digidx = X->used - 1;
-   bitcpy = bitbuf = 0;
-  
-   bitcnt = 1;
-   for (;;) {
-      /* grab next digit as required */
-      if (--bitcnt == 0) {
-         if (digidx == -1) {
-            break;
-         }
-         buf = X->dp[digidx--];
-         bitcnt = (int)DIGIT_BIT;
-      }
-      
-      /* grab the next msb from the exponent */
-      y = (buf >> (DIGIT_BIT - 1)) & 1;
-      buf <<= 1;
-    
-      /* if the bit is zero and mode == 0 then we ignore it 
-       * These represent the leading zero bits before the first 1 bit
-       * in the exponent.  Technically this opt is not required but it 
-       * does lower the # of trivial squaring/reductions used
-       */
-      if (mode == 0 && y == 0) continue;
-      
-      /* if the bit is zero and mode == 1 then we square */
-      if (y == 0 && mode == 1) {
-         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
-            goto __RES;
-         }
-         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
-            goto __RES;
-         }
-         continue;
-      }
-      
-      /* else we add it to the window */
-      bitbuf  |= (y<<(winsize-++bitcpy));
-      mode     = 2;
-      
-      if (bitcpy == winsize) {
-         /* ok window is filled so square as required and multiply multiply */
-         /* square first */
-         for (x = 0; x < winsize; x++) {
-            if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
-               goto __RES;
-            }
-            if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
-               goto __RES;
-            }
-         }
-         
-         /* then multiply */
-         if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
-            goto __RES;
-         }
-         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
-            goto __RES;
-         }
-         
-         /* empty window and reset */
-         bitcpy = bitbuf = 0;
-         mode   = 1;
-      }
-   }
-   
-   /* if bits remain then square/multiply */
-   if (mode == 2 && bitcpy > 0) {
-      /* square then multiply if the bit is set */
-      for (x = 0; x < bitcpy; x++) {
-         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
-            goto __RES;
-         }
-         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
-            goto __RES;
-         }
-         
-         bitbuf <<= 1;
-         if ((bitbuf & (1<<winsize)) != 0) {
-            /* then multiply */
-            if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
-               goto __RES;
-            }
-            if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
-               goto __RES;
-            }
-         }
-      }
-   }
-   
-   /* fixup result */
-   if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
-      goto __RES;
-   } 
-   
-   mp_exch(&res, Y);
-   err = MP_OKAY;
-__RES: mp_clear(&res);
-__M  :
-   for (x = 0; x < (1<<winsize); x++) {
-      mp_clear(&M[x]);
-   }
-   DECFUNC();
-   return err;
-}
-
-int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
-{
-   mp_int M[256], res, mu;
-   mp_digit buf;
-   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-   
-   REGFUNC("mp_exptmod");
-   VERIFY(G);
-   VERIFY(X);
-   VERIFY(P);
-   VERIFY(Y);
-   
-   /* if the modulus is odd use the fast method */
-   if (mp_isodd(P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
-      err = mp_exptmod_fast(G, X, P, Y);
-      DECFUNC();
-      return err;
-   }   
-
-   /* find window size */
-   x = mp_count_bits(X);
-        if (x <= 7)     { winsize = 2; }
-   else if (x <= 36)    { winsize = 3; }
-   else if (x <= 140)   { winsize = 4; }
-   else if (x <= 450)   { winsize = 5; }
-   else if (x <= 1303)  { winsize = 6; }
-   else if (x <= 3529)  { winsize = 7; }
-   else                 { winsize = 8; }
-   
-   /* init G array */
-   for (x = 0; x < (1<<winsize); x++) {
-      if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
-         for (y = 0; y < x; y++) {
-            mp_clear(&M[y]);
-         }
-         DECFUNC();
-         return err;
-      }
-   }
-
-   /* create mu, used for Barrett reduction */
-   if ((err = mp_init(&mu)) != MP_OKAY) {
-      goto __M;
-   }
-   if ((err = mp_reduce_setup(&mu, P)) != MP_OKAY) {
-      goto __MU;
-   }
-   
-   /* create M table 
-    *
-    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
-    *
-    * The first half of the table is not computed though accept for M[0] and M[1]
-    */
-   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-      goto __MU;
-   }
-   
-   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-   if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
-      goto __MU;
-   }
-   
-   for (x = 0; x < (winsize-1); x++) {
-       if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) {
-          goto __MU;
-       }
-       if ((err = mp_reduce(&M[1<<(winsize-1)], P, &mu)) != MP_OKAY) {
-          goto __MU;
-       }
-   }  
-   
-   /* create upper table */
-   for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) {
-       if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
-          goto __MU;
-       }
-       if ((err = mp_reduce(&M[x], P, &mu)) != MP_OKAY) {
-          goto __MU;
-       }
-   }
-   
-   /* setup result */
-   if ((err = mp_init(&res)) != MP_OKAY) {
-      goto __MU;
-   }
-   mp_set(&res, 1);
-   
-   /* set initial mode and bit cnt */
-   mode   = 0;
-   bitcnt = 0;
-   buf    = 0;
-   digidx = X->used - 1;
-   bitcpy = bitbuf = 0;
-   
-   bitcnt = 1;
-   for (;;) {
-      /* grab next digit as required */
-      if (--bitcnt == 0) {
-         if (digidx == -1) {
-            break;
-         }
-         buf = X->dp[digidx--];
-         bitcnt = (int)DIGIT_BIT;
-      }
-      
-      /* grab the next msb from the exponent */
-      y = (buf >> (DIGIT_BIT - 1)) & 1;
-      buf <<= 1;
-    
-      /* if the bit is zero and mode == 0 then we ignore it 
-       * These represent the leading zero bits before the first 1 bit
-       * in the exponent.  Technically this opt is not required but it 
-       * does lower the # of trivial squaring/reductions used
-       */
-      if (y == 0 && mode == 0) continue;
-      
-      /* if the bit is zero and mode == 1 then we square */
-      if (y == 0 && mode == 1) {
-         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
-            goto __RES;
-         }
-         if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-            goto __RES;
-         }
-         continue;
-      }
-      
-      /* else we add it to the window */
-      bitbuf  |= (y<<(winsize-++bitcpy));
-      mode     = 2;
-      
-      if (bitcpy == winsize) {
-         /* ok window is filled so square as required and multiply multiply */
-         /* square first */
-         for (x = 0; x < winsize; x++) {
-            if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
-               goto __RES;
-            }
-            if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-               goto __RES;
-            }
-         }
-         
-         /* then multiply */
-         if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
-            goto __MU;
-         }
-         if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-            goto __MU;
-         }
-         
-         /* empty window and reset */
-         bitcpy = bitbuf = 0;
-         mode   = 1;
-      }
-   }
-   
-   /* if bits remain then square/multiply */
-   if (mode == 2 && bitcpy > 0) {
-      /* square then multiply if the bit is set */
-      for (x = 0; x < bitcpy; x++) {
-         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
-            goto __RES;
-         }
-         if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-            goto __RES;
-         }
-         
-         bitbuf <<= 1;
-         if ((bitbuf & (1<<winsize)) != 0) {
-            /* then multiply */
-            if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
-               goto __RES;
-            }
-            if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-               goto __RES;
-            }
-         }
-      }
-   }
-   
-   mp_exch(&res, Y);
-   err = MP_OKAY;
-__RES: mp_clear(&res);
-__MU : mp_clear(&mu);
-__M  :
-   for (x = 0; x < (1<<winsize); x++) {
-      mp_clear(&M[x]);
-   }
-   DECFUNC();
-   return err;
-}
-
-/* computes a = 2^b */
-int mp_2expt(mp_int *a, int b)
-{
-   int res;
-   
-   mp_zero(a);
-   if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) {
-      return res;
-   }
-   a->used = b/DIGIT_BIT + 1;
-   a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT);
-   
-   return MP_OKAY;
-}   
-   
-
-/* find the n'th root of an integer 
- *
- * Result found such that (c)^b <= a and (c+1)^b > a 
- */
-int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
-{
-   mp_int t1, t2, t3;
-   int res, neg;
-   
-   /* input must be positive if b is even*/
-   if ((b&1) == 0 && a->sign == MP_NEG) {
-      return MP_VAL;
-   }
-   
-   if ((res = mp_init(&t1)) != MP_OKAY) {
-      return res;
-   }
-   
-   if ((res = mp_init(&t2)) != MP_OKAY) {
-      goto __T1;
-   }
-   
-   if ((res = mp_init(&t3)) != MP_OKAY) {
-      goto __T2;
-   }
-
-   /* if a is negative fudge the sign but keep track */
-   neg     = a->sign;
-   a->sign = MP_ZPOS;
-
-   /* t2 = 2 */
-   mp_set(&t2, 2);
-  
-   do {
-      /* t1 = t2 */
-      if ((res = mp_copy(&t2, &t1)) != MP_OKAY) {
-         goto __T3;
-      }
-
-      /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
-      if ((res = mp_expt_d(&t1, b-1, &t3)) != MP_OKAY) {            /* t3 = t1^(b-1) */
-         goto __T3;
-      }
-
-      /* numerator */
-      if ((res = mp_mul(&t3, &t1, &t2)) != MP_OKAY) {               /* t2 = t1^b */
-         goto __T3;
-      }
-      
-      if ((res = mp_sub(&t2, a, &t2)) != MP_OKAY) {                 /* t2 = t1^b - a */
-         goto __T3;
-      }
-
-      if ((res = mp_mul_d(&t3, b, &t3)) != MP_OKAY) {               /* t3 = t1^(b-1) * b  */
-         goto __T3;
-      }
-      
-      if ((res = mp_div(&t2, &t3, &t3, NULL)) != MP_OKAY) {         /* t3 = (t1^b - a)/(b * t1^(b-1)) */
-         goto __T3;
-      }
-      
-      if ((res = mp_sub(&t1, &t3, &t2)) != MP_OKAY) {
-         goto __T3;
-      }
-   } while (mp_cmp(&t1, &t2) != MP_EQ);
-   
-   /* result can be off by a few so check */
-   for (;;) {
-      if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
-         goto __T3;
-      }
-   
-      if (mp_cmp(&t2, a) == MP_GT) {
-         if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
-            goto __T3;
-         }
-      } else {
-         break;
-      }
-   }      
-   
-   /* reset the sign of a first */
-   a->sign = neg;
-   
-   /* set the result */
-   mp_exch(&t1, c);
-   
-   /* set the sign of the result */
-   c->sign = neg;   
-   
-   res = MP_OKAY;
-   
-__T3:  mp_clear(&t3);
-__T2:  mp_clear(&t2);
-__T1:  mp_clear(&t1);
-   return res;
-}
-
-/* computes the jacobi c = (a | n) (or Legendre if b is prime) 
- * HAC pp. 73 Algorithm 2.149 
- */
-int mp_jacobi(mp_int *a, mp_int *n, int *c)
-{
-   mp_int a1, n1, e;
-   int s, r, res;
-   mp_digit residue;
-   
-   /* step 1.  if a == 0, return 0 */
-   if (mp_iszero(a) == 1) {
-      *c = 0;
-      return MP_OKAY;
-   }
-   
-   /* step 2.  if a == 1, return 1 */
-   if (mp_cmp_d(a, 1) == MP_EQ) {
-      *c = 1;
-      return MP_OKAY;
-   }
-   
-   /* default */
-   s = 0;
-   
-   /* step 3.  write a = a1 * 2^e  */
-   if ((res = mp_init_copy(&a1, a)) != MP_OKAY) {
-      return res;
-   }
-   
-   if ((res = mp_init(&n1)) != MP_OKAY) {
-      goto __A1;
-   }
-   
-   if ((res = mp_init(&e)) != MP_OKAY) {
-      goto __N1;
-   }
-   
-   while (mp_iseven(&a1) == 1) {
-       if ((res = mp_add_d(&e, 1, &e)) != MP_OKAY) {
-          goto __E;
-       }
-       
-       if ((res = mp_div_2(&a1, &a1)) != MP_OKAY) {
-          goto __E;
-       }
-   }
-   
-   /* step 4.  if e is even set s=1 */
-   if (mp_iseven(&e) == 1) {
-      s = 1;
-   } else {
-      /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
-      if ((res = mp_mod_d(n, 8, &residue)) != MP_OKAY) {
-         goto __E;
-      }
-      
-      if (residue == 1 || residue == 7) {
-         s = 1;
-      } else if (residue == 3 || residue == 5) {
-         s = -1;
-      }
-   }
-   
-   /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-   if ((res = mp_mod_d(n, 4, &residue)) != MP_OKAY) {
-      goto __E;
-   }
-   if (residue == 3) {
-      if ((res = mp_mod_d(&a1, 4, &residue)) != MP_OKAY) {
-         goto __E;
-      }
-      if (residue == 3) {
-         s = -s;
-      }
-   }
-   
-   /* if a1 == 1 we're done */
-   if (mp_cmp_d(&a1, 1) == MP_EQ) {
-      *c = s;
-   } else {
-      /* n1 = n mod a1 */
-      if ((res = mp_mod(n, &a1, &n1)) != MP_OKAY) {
-         goto __E;
-      }
-      if ((res = mp_jacobi(&n1, &a1, &r)) != MP_OKAY) {
-         goto __E;
-      }
-      *c = s * r;
-   }
-   
-   /* done */
-   res = MP_OKAY;
-__E:   mp_clear(&e);
-__N1:  mp_clear(&n1);
-__A1:  mp_clear(&a1);
-   return res;
-}
-
-/* --> radix conversion <-- */
-/* reverse an array, used for radix code */
-static void reverse(unsigned char *s, int len)
-{
-   int ix, iy;
-   unsigned char t;
-   
-   ix = 0; 
-   iy = len - 1;
-   while (ix < iy) {
-       t = s[ix]; s[ix] = s[iy]; s[iy] = t;
-       ++ix;
-       --iy;
-   }
-}
-
-/* returns the number of bits in an int */
-int mp_count_bits(mp_int *a)
-{
-   int r;
-   mp_digit q;
-   
-   if (a->used == 0) {
-      return 0;
-   }
-    
-   r = (a->used - 1) * DIGIT_BIT;
-   q = a->dp[a->used - 1];
-   while (q) {
-      ++r;
-      q >>= ((mp_digit)1);
-   }
-   return r;
-}
-
-/* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c)
-{
-   int res;
-   mp_zero(a);
-   while (c-- > 0) {
-       if ((res = mp_mul_2d(a, 8, a)) != MP_OKAY) {
-          return res;
-       }
-       
-       if (DIGIT_BIT != 7) {
-           a->dp[0] |= *b++;
-           a->used  += 1;
-       } else {
-           a->dp[0]  = (*b & MP_MASK);
-           a->dp[1] |= ((*b++ >> 7U) & 1);
-           a->used  += 2;
-       }
-   }
-   mp_clamp(a);
-   return MP_OKAY;
-}   
-
-/* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int mp_read_signed_bin(mp_int *a, unsigned char *b, int c)
-{
-   int res;
-   
-   if ((res = mp_read_unsigned_bin(a, b + 1, c - 1)) != MP_OKAY) {
-      return res;
-   }
-   a->sign = ((b[0] == (unsigned char)0) ? MP_ZPOS : MP_NEG);
-   return MP_OKAY;
-}
-
-/* store in unsigned [big endian] format */
-int mp_to_unsigned_bin(mp_int *a, unsigned char *b)
-{
-   int x, res;
-   mp_int t;
-   
-   if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
-      return res;
-   }
-   
-   x = 0;
-   while (mp_iszero(&t) == 0) {
-      if (DIGIT_BIT != 7) {
-         b[x++] = (unsigned char)(t.dp[0] & 255);
-      } else {
-         b[x++] = (unsigned char)(t.dp[0] | ((t.dp[1] & 0x01) << 7));
-      }
-      if ((res = mp_div_2d(&t, 8, &t, NULL))  != MP_OKAY) {
-         mp_clear(&t);
-         return res;
-      }
-   }
-   reverse(b, x);
-   mp_clear(&t);
-   return MP_OKAY;
-}
-
-/* store in signed [big endian] format */
-int mp_to_signed_bin(mp_int *a, unsigned char *b)
-{
-   int res;
-   
-   if ((res = mp_to_unsigned_bin(a, b+1)) != MP_OKAY) {
-      return res;
-   }
-   b[0] = (unsigned char)((a->sign == MP_ZPOS) ? 0 : 1);
-   return MP_OKAY;
-}
-
-/* get the size for an unsigned equivalent */
-int mp_unsigned_bin_size(mp_int *a)
-{
-   int size = mp_count_bits(a);
-   return (size/8 + ((size&7) != 0 ? 1 : 0));
-}
-
-/* get the size for an signed equivalent */
-int mp_signed_bin_size(mp_int *a)
-{
-   return 1 + mp_unsigned_bin_size(a);
-}
-
-/* read a string [ASCII] in a given radix */
-int mp_read_radix(mp_int *a, char *str, int radix)
-{
-   int y, res, neg;
-   char ch;
-   
-   if (radix < 2 || radix > 64) {
-      return MP_VAL;
-   }
-
-   if (*str == '-') {
-      ++str;
-      neg = MP_NEG;
-   } else {
-      neg = MP_ZPOS;
-   }
-   
-   mp_zero(a);
-   while (*str) {
-      ch = (char)((radix < 36) ? toupper(*str) : *str);
-      for (y = 0; y < 64; y++) {
-          if (ch == s_rmap[y]) {
-             break;
-          }
-      }
-      
-      if (y < radix) {
-         if ((res = mp_mul_d(a, (mp_digit)radix, a)) != MP_OKAY) {
-            return res;
-         }
-         if ((res = mp_add_d(a, (mp_digit)y, a)) != MP_OKAY) {
-            return res;
-         }
-      } else {
-         break;
-      }
-      ++str;
-   }
-   a->sign = neg;
-   return MP_OKAY;
-}
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int mp_toradix(mp_int *a, char *str, int radix)
-{
-   int res, digs;
-   mp_int t;
-   mp_digit d;
-   char *_s = str;
-   
-   if (radix < 2 || radix > 64) {
-      return MP_VAL;
-   }
-
-   if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
-      return res;
-   }
-   
-   if (t.sign == MP_NEG) { 
-      ++_s;
-      *str++ = '-';
-      t.sign = MP_ZPOS;
-   }
-   
-   digs = 0;
-   while (mp_iszero(&t) == 0) {
-       if ((res = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
-          mp_clear(&t);
-          return res;
-       }
-       *str++ = s_rmap[d];
-       ++digs;
-   }
-   reverse((unsigned char *)_s, digs);
-   *str++ = '\0';
-   mp_clear(&t);
-   return MP_OKAY;
-}
-
-/* returns size of ASCII reprensentation */
-int mp_radix_size(mp_int *a, int radix)
-{
-   int res, digs;
-   mp_int t;
-   mp_digit d;
-   
-   /* special case for binary */
-   if (radix == 2) {
-      return mp_count_bits(a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-   }
-      
-   if (radix < 2 || radix > 64) {
-      return 0;
-   }
-
-   if ((res = mp_init_copy(&t, a)) != MP_OKAY) {
-      return 0;
-   }
-   
-   digs = 0;
-   if (t.sign == MP_NEG) { 
-      ++digs;
-      t.sign = MP_ZPOS;
-   }
-   
-   while (mp_iszero(&t) == 0) {
-       if ((res = mp_div_d(&t, (mp_digit)radix, &t, &d)) != MP_OKAY) {
-          mp_clear(&t);
-          return 0;
-       }
-       ++digs;
-   }
-   mp_clear(&t);
-   return digs + 1;
-}
-
diff --git a/bn.pdf b/bn.pdf
index b8152e1c1..df69417ba 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 5c8b73e46..b5f02275a 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
 
-\title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -35,23 +35,24 @@ \section{Introduction}
 
 \section{Building Against LibTomMath}
 
-Building against LibTomMath is very simple because there is only one source file.  Simply add ``bn.c'' to your project and 
-copy both ``bn.c'' and ``bn.h'' into your project directory.  There is no configuration nor building required before hand.
+As of v0.12 LibTomMath is not a simple single source file project like MPI.  LibTomMath retains the exact same API as MPI
+but is implemented differently.  To build LibTomMath you will need a copy of GNU cc and GNU make.  Both are free so if you 
+don't have a copy don't whine to me about it.
 
-If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them 
-with references to LibTomMath.  For example, substitute 
+To build the library type 
 
 \begin{verbatim}
-#include "mpi.h"
+make
 \end{verbatim}
 
-with 
+This will build the library file libtommath.a.  If you want to build the library and also install it (in /usr/bin and /usr/include) then
+type 
 
 \begin{verbatim}
-#include "bn.h"
+make install
 \end{verbatim}
 
-Remove ``mpi.c'' from your project and replace it with ``bn.c''.
+Now within your application include ``tommath.h'' and link against libtommath.a to get MPI-like functionality.
 
 \section{Programming with LibTomMath}
 
@@ -183,6 +184,28 @@ \subsection{Digit Manipulations}
 
 /* c = a mod 2^d */
 int mp_mod_2d(mp_int *a, int b, mp_int *c);
+
+/* computes a = 2^b */
+int mp_2expt(mp_int *a, int b);
+
+/* makes a pseudo-random int of a given size */
+int mp_rand(mp_int *a, int digits);
+
+\end{verbatim}
+
+\subsection{Binary Operations}
+
+\begin{verbatim}
+
+/* c = a XOR b  */
+int mp_xor(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a OR b */
+int mp_or(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a AND b */
+int mp_and(mp_int *a, mp_int *b, mp_int *c);
+
 \end{verbatim}
 
 \subsection{Basic Arithmetic}
@@ -366,6 +389,27 @@ \subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
 
 This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
 
+\subsubsection{mp\_2expt(mp\_int *a, int b)}
+Computes $a = 2^b$ by first setting $a$ to zero then OR'ing the correct bit to get the right value.
+
+\subsubsection{mp\_rand(mp\_int *a, int digits)}
+Computes a pseudo-random (\textit{via rand()}) integer that is always ``$digits$'' digits in length.  Not for
+cryptographic use.
+
+\subsection{Binary Arithmetic}
+\subsubsection{mp\_xor(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c = a \oplus b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
+of $c$ is the maximum length of the two inputs.
+
+\subsubsection{mp\_or(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c = a \lor b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
+of $c$ is the maximum length of the two inputs.
+
+\subsubsection{mp\_and(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
+of $c$ is the maximum length of the two inputs.
+
+
 \subsection{Basic Arithmetic}
 
 \subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
@@ -622,14 +666,13 @@ \subsection{Exponentiation Algorithms}
 124 multiplications.  The MPI method would have 512 squarings and 512 multiplications.  Randomly every $2k$ bits another 
 multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides.
 
-Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given.  
+Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given.
 However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction
 do not have to give full precision.  As a result the reduction step is much faster and just as accurate.  The LibTomMath code
 will automatically determine at run-time (e.g. when its called) whether the faster multiplier can be used.  The
 faster multipliers have also been optimized into the two variants (baseline and comba baseline).
 
-As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI.  
-
-
+LibTomMath also has a variant of the exptmod function that uses Montgomery reductions instead of Barrett reductions
+which is faser.  As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI.  
 
 \end{document}
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
new file mode 100644
index 000000000..e1dcce303
--- /dev/null
+++ b/bn_fast_mp_invmod.c
@@ -0,0 +1,168 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
+int
+fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int    x, y, u, v, B, D;
+  int       res, neg;
+
+
+  if ((res = mp_init (&x)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  if ((res = mp_init (&y)) != MP_OKAY) {
+    goto __X;
+  }
+
+  if ((res = mp_init (&u)) != MP_OKAY) {
+    goto __Y;
+  }
+
+  if ((res = mp_init (&v)) != MP_OKAY) {
+    goto __U;
+  }
+
+  if ((res = mp_init (&B)) != MP_OKAY) {
+    goto __V;
+  }
+
+  if ((res = mp_init (&D)) != MP_OKAY) {
+    goto __B;
+  }
+
+  /* x == modulus, y == value to invert */
+  if ((res = mp_copy (b, &x)) != MP_OKAY) {
+    goto __D;
+  }
+  if ((res = mp_copy (a, &y)) != MP_OKAY) {
+    goto __D;
+  }
+
+  if ((res = mp_abs (&y, &y)) != MP_OKAY) {
+    goto __D;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __D;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __D;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __D;
+  }
+  mp_set (&D, 1);
+
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __D;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_iseven (&B) == 0) {
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+	goto __D;
+      }
+    }
+    /* A = A/2, B = B/2 */
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __D;
+    }
+    /* 5.2 if C,D are even then */
+    if (mp_iseven (&D) == 0) {
+      /* C = (C+y)/2, D = (D-x)/2 */
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+	goto __D;
+      }
+    }
+    /* C = C/2, D = D/2 */
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto __D;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto __D;
+    }
+  } else {
+    /* v - v - u, C = C - A, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto __D;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __D;
+  }
+
+  /* b is now the inverse */
+  neg = a->sign;
+  while (D.sign == MP_NEG) {
+    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+  mp_exch (&D, c);
+  c->sign = neg;
+  res = MP_OKAY;
+
+__D:mp_clear (&D);
+__B:mp_clear (&B);
+__V:mp_clear (&v);
+__U:mp_clear (&u);
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__ERR:
+  return res;
+}
diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c
new file mode 100644
index 000000000..dbc3478b1
--- /dev/null
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -0,0 +1,116 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
+int
+fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+{
+  int       ix, res, olduse;
+  mp_word   W[512];
+
+  /* get old used count */
+  olduse = a->used;
+
+  /* grow a as required */
+  if (a->alloc < m->used + 1) {
+    if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* copy the digits of a */
+  for (ix = 0; ix < a->used; ix++) {
+    W[ix] = a->dp[ix];
+  }
+
+  /* zero the high words */
+  for (; ix < m->used * 2 + 1; ix++) {
+    W[ix] = 0;
+  }
+
+  for (ix = 0; ix < m->used; ix++) {
+    /* ui = ai * m' mod b
+     *
+     * We avoid a double precision multiplication (which isn't required)
+     * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
+     * the carry cleared (see after the inner loop)
+     */
+    register mp_digit ui;
+    ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK;
+
+    /* a = a + ui * m * b^i
+     *
+     * This is computed in place and on the fly.  The multiplication
+     * by b^i is handled by offseting which columns the results
+     * are added to.
+     *
+     * Note the comba method normally doesn't handle carries in the inner loop
+     * In this case we fix the carry from the previous column since the Montgomery
+     * reduction requires digits of the result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The carry fixups are done
+     * in order so after these loops the first m->used words of W[] have the carries
+     * fixed
+     */
+    {
+      register int iy;
+      register mp_digit *tmpx;
+      register mp_word *_W;
+
+      /* aliases */
+      tmpx = m->dp;
+      _W = W + ix;
+
+      /* inner loop */
+      for (iy = 0; iy < m->used; iy++) {
+	*_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+      }
+    }
+
+    /* now fix carry for next digit, W[ix+1] */
+    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+  }
+
+  /* nox fix rest of carries */
+  for (++ix; ix <= m->used * 2 + 1; ix++) {
+    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+  }
+
+  /* copy out, A = A/b^n
+   *
+   * The result is A/b^n but instead of converting from an array of mp_word
+   * to mp_digit than calling mp_rshd we just copy them in the right
+   * order
+   */
+  for (ix = 0; ix < m->used + 1; ix++) {
+    a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK);
+  }
+
+  /* set the max used */
+  a->used = m->used + 1;
+
+  /* zero oldused digits, if the input a was larger than
+   * m->used+1 we'll have to clear the digits */
+  for (; ix < olduse; ix++) {
+    a->dp[ix] = 0;
+  }
+  mp_clamp (a);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (a, m) != MP_LT) {
+    return s_mp_sub (a, m, a);
+  }
+  return MP_OKAY;
+}
diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c
new file mode 100644
index 000000000..c433dd745
--- /dev/null
+++ b/bn_fast_s_mp_mul_digs.c
@@ -0,0 +1,113 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* Fast (comba) multiplier
+ *
+ * This is the fast column-array [comba] multiplier.  It is designed to compute
+ * the columns of the product first then handle the carries afterwards.  This
+ * has the effect of making the nested loops that compute the columns very
+ * simple and schedulable on super-scalar processors.
+ *
+ */
+int
+fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int       olduse, res, pa, ix;
+  mp_word   W[512];
+
+  if (c->alloc < digs) {
+    if ((res = mp_grow (c, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* clear temp buf (the columns) */
+  memset (W, 0, sizeof (mp_word) * digs);
+
+  /* calculate the columns */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+
+    /* this multiplier has been modified to allow you to control how many digits 
+     * of output are produced.  So at most we want to make upto "digs" digits
+     * of output
+     */
+
+
+    /* this adds products to distinct columns (at ix+iy) of W
+     * note that each step through the loop is not dependent on
+     * the previous which means the compiler can easily unroll
+     * the loop without scheduling problems
+     */
+    {
+      register mp_digit tmpx, *tmpy;
+      register mp_word *_W;
+      register int iy, pb;
+
+      /* alias for the the word on the left e.g. A[ix] * A[iy] */
+      tmpx = a->dp[ix];
+
+      /* alias for the right side */
+      tmpy = b->dp;
+
+      /* alias for the columns, each step through the loop adds a new
+         term to each column
+       */
+      _W = W + ix;
+
+      /* the number of digits is limited by their placement.  E.g. 
+         we avoid multiplying digits that will end up above the # of
+         digits of precision requested
+       */
+      pb = MIN (b->used, digs - ix);
+
+      for (iy = 0; iy < pb; iy++) {
+	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+
+  }
+
+  /* setup dest */
+  olduse = c->used;
+  c->used = digs;
+
+
+  /* At this point W[] contains the sums of each column.  To get the
+   * correct result we must take the extra bits from each column and
+   * carry them down
+   *
+   * Note that while this adds extra code to the multiplier it saves time
+   * since the carry propagation is removed from the above nested loop.
+   * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
+   * N^2 + N*c where c is the cost of the shifting.  On very small numbers
+   * this is slower but on most cryptographic size numbers it is faster.
+   */
+
+  for (ix = 1; ix < digs; ix++) {
+    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+  }
+  c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
+
+  /* clear unused */
+  for (; ix < olduse; ix++) {
+    c->dp[ix] = 0;
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c
new file mode 100644
index 000000000..cc30386b4
--- /dev/null
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -0,0 +1,86 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* this is a modified version of fast_s_mp_mul_digs that only produces
+ * output digits *above* digs.  See the comments for fast_s_mp_mul_digs
+ * to see how it works.
+ *
+ * This is used in the Barrett reduction since for one of the multiplications
+ * only the higher digits were needed.  This essentially halves the work.
+ */
+int
+fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int       oldused, newused, res, pa, pb, ix;
+  mp_word   W[512];
+
+
+  newused = a->used + b->used + 1;
+  if (c->alloc < newused) {
+    if ((res = mp_grow (c, newused)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* like the other comba method we compute the columns first */
+  pa = a->used;
+  pb = b->used;
+  memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word));
+  for (ix = 0; ix < pa; ix++) {
+    {
+      register mp_digit tmpx, *tmpy;
+      register int iy;
+      register mp_word *_W;
+
+      /* work todo, that is we only calculate digits that are at "digs" or above  */
+      iy = digs - ix;
+
+      /* copy of word on the left of A[ix] * B[iy] */
+      tmpx = a->dp[ix];
+
+      /* alias for right side */
+      tmpy = b->dp + iy;
+
+      /* alias for the columns of output.  Offset to be equal to or above the 
+       * smallest digit place requested 
+       */
+      _W = &(W[digs]);
+
+      /* compute column products for digits above the minimum */
+      for (; iy < pb; iy++) {
+	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+  }
+
+  /* setup dest */
+  oldused = c->used;
+  c->used = newused;
+
+  /* now convert the array W downto what we need */
+  for (ix = digs + 1; ix < newused; ix++) {
+    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+  }
+  c->dp[(pa + pb + 1) - 1] =
+    (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+
+  for (; ix < oldused; ix++) {
+    c->dp[ix] = 0;
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c
new file mode 100644
index 000000000..c4f786da6
--- /dev/null
+++ b/bn_fast_s_mp_sqr.c
@@ -0,0 +1,112 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* fast squaring
+ *
+ * This is the comba method where the columns of the product are computed first
+ * then the carries are computed.  This has the effect of making a very simple
+ * inner loop that is executed the most
+ *
+ * W2 represents the outer products and W the inner.  
+ *
+ * A further optimizations is made because the inner products are of the form
+ * "A * B * 2".  The *2 part does not need to be computed until the end which is
+ * good because 64-bit shifts are slow!
+ *
+ *
+ */
+int
+fast_s_mp_sqr (mp_int * a, mp_int * b)
+{
+  int       olduse, newused, res, ix, pa;
+  mp_word   W2[512], W[512];
+
+  pa = a->used;
+  newused = pa + pa + 1;
+  if (b->alloc < newused) {
+    if ((res = mp_grow (b, newused)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* zero temp buffer (columns) 
+   * Note that there are two buffers.  Since squaring requires
+   * a outter and inner product and the inner product requires 
+   * computing a product and doubling it (a relatively expensive
+   * op to perform n^2 times if you don't have to) the inner and
+   * outer products are computed in different buffers.  This way
+   * the inner product can be doubled using n doublings instead of
+   * n^2
+   */
+  memset (W, 0,  newused * sizeof (mp_word));
+  memset (W2, 0, newused * sizeof (mp_word));
+
+  /* This computes the inner product.  To simplify the inner N^2 loop
+   * the multiplication by two is done afterwards in the N loop.
+   */
+  for (ix = 0; ix < pa; ix++) {
+     /* compute the outer product */
+    W2[ix + ix] += ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+
+    {
+      register mp_digit tmpx, *tmpy;
+      register mp_word *_W;
+      register int iy;
+
+      /* copy of left side */
+      tmpx = a->dp[ix];
+
+      /* alias for right side */
+      tmpy = a->dp + (ix + 1);
+
+      /* the column to store the result in */
+      _W = W + (ix + ix + 1);
+
+      /* inner products */
+      for (iy = ix + 1; iy < pa; iy++) {
+	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+  }
+
+  /* setup dest */
+  olduse = b->used;
+  b->used = newused;
+
+  /* double first value, since the inner products are half of what they should be */
+  W[0] += W[0] + W2[0];
+
+  /* now compute digits */
+  for (ix = 1; ix < newused; ix++) {
+    /* double/add next digit */
+    W[ix] += W[ix] + W2[ix];
+
+    W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+    b->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+  }
+  b->dp[(newused) - 1] = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
+
+  /* clear high */
+  for (; ix < olduse; ix++) {
+    b->dp[ix] = 0;
+  }
+
+  /* fix the sign (since we no longer make a fresh temp) */
+  b->sign = MP_ZPOS;
+
+  mp_clamp (b);
+  return MP_OKAY;
+}
diff --git a/bn_mp_2expt.c b/bn_mp_2expt.c
new file mode 100644
index 000000000..ace1b9999
--- /dev/null
+++ b/bn_mp_2expt.c
@@ -0,0 +1,31 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes a = 2^b */
+int
+mp_2expt (mp_int * a, int b)
+{
+  int       res;
+
+  mp_zero (a);
+  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
+    return res;
+  }
+  a->used = b / DIGIT_BIT + 1;
+  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+
+  return MP_OKAY;
+}
diff --git a/bn_mp_abs.c b/bn_mp_abs.c
new file mode 100644
index 000000000..299d3b659
--- /dev/null
+++ b/bn_mp_abs.c
@@ -0,0 +1,27 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* b = |a| */
+int
+mp_abs (mp_int * a, mp_int * b)
+{
+  int       res;
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+  b->sign = MP_ZPOS;
+  return MP_OKAY;
+}
diff --git a/bn_mp_add.c b/bn_mp_add.c
new file mode 100644
index 000000000..a2ad4fd9f
--- /dev/null
+++ b/bn_mp_add.c
@@ -0,0 +1,56 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* high level addition (handles signs) */
+int
+mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       sa, sb, res;
+
+
+  sa = a->sign;
+  sb = b->sign;
+
+  /* handle four cases */
+  if (sa == MP_ZPOS && sb == MP_ZPOS) {
+    /* both positive */
+    res = s_mp_add (a, b, c);
+    c->sign = MP_ZPOS;
+  } else if (sa == MP_ZPOS && sb == MP_NEG) {
+    /* a + -b == a - b, but if b>a then we do it as -(b-a) */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      res = s_mp_sub (b, a, c);
+      c->sign = MP_NEG;
+    } else {
+      res = s_mp_sub (a, b, c);
+      c->sign = MP_ZPOS;
+    }
+  } else if (sa == MP_NEG && sb == MP_ZPOS) {
+    /* -a + b == b - a, but if a>b then we do it as -(a-b) */
+    if (mp_cmp_mag (a, b) == MP_GT) {
+      res = s_mp_sub (a, b, c);
+      c->sign = MP_NEG;
+    } else {
+      res = s_mp_sub (b, a, c);
+      c->sign = MP_ZPOS;
+    }
+  } else {
+    /* -a + -b == -(a + b) */
+    res = s_mp_add (a, b, c);
+    c->sign = MP_NEG;
+  }
+  return res;
+}
diff --git a/bn_mp_add_d.c b/bn_mp_add_d.c
new file mode 100644
index 000000000..0391bc158
--- /dev/null
+++ b/bn_mp_add_d.c
@@ -0,0 +1,33 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* single digit addition */
+int
+mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int    t;
+  int       res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+  mp_set (&t, b);
+  res = mp_add (a, &t, c);
+
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_addmod.c b/bn_mp_addmod.c
new file mode 100644
index 000000000..adce5d565
--- /dev/null
+++ b/bn_mp_addmod.c
@@ -0,0 +1,36 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* d = a + b (mod c) */
+int
+mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int       res;
+  mp_int    t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_and.c b/bn_mp_and.c
new file mode 100644
index 000000000..d153c44cf
--- /dev/null
+++ b/bn_mp_and.c
@@ -0,0 +1,51 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* AND two ints together */
+int
+mp_and (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       res, ix, px;
+  mp_int    t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] &= x->dp[ix];
+  }
+
+  /* zero digits above the last from the smallest mp_int */
+  for (; ix < t.used; ix++) {
+    t.dp[ix] = 0;
+  }
+
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_mp_clamp.c b/bn_mp_clamp.c
new file mode 100644
index 000000000..77f3e1ab2
--- /dev/null
+++ b/bn_mp_clamp.c
@@ -0,0 +1,26 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* trim unused digits */
+void
+mp_clamp (mp_int * a)
+{
+  while (a->used > 0 && a->dp[a->used - 1] == 0)
+    --(a->used);
+  if (a->used == 0) {
+    a->sign = MP_ZPOS;
+  }
+}
diff --git a/bn_mp_clear.c b/bn_mp_clear.c
new file mode 100644
index 000000000..e88e95ac5
--- /dev/null
+++ b/bn_mp_clear.c
@@ -0,0 +1,33 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* clear one (frees)  */
+void
+mp_clear (mp_int * a)
+{
+  if (a->dp != NULL) {
+
+    /* first zero the digits */
+    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+
+    /* free ram */
+    free (a->dp);
+
+    /* reset members to make debugging easier */
+    a->dp = NULL;
+    a->alloc = a->used = 0;
+  }
+}
diff --git a/bn_mp_cmp.c b/bn_mp_cmp.c
new file mode 100644
index 000000000..527d35c98
--- /dev/null
+++ b/bn_mp_cmp.c
@@ -0,0 +1,30 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* compare two ints (signed)*/
+int
+mp_cmp (mp_int * a, mp_int * b)
+{
+  int       res;
+  /* compare based on sign */
+  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
+    return MP_LT;
+  } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
+    return MP_GT;
+  }
+  res = mp_cmp_mag (a, b);
+  return res;
+}
diff --git a/bn_mp_cmp_d.c b/bn_mp_cmp_d.c
new file mode 100644
index 000000000..1506b8779
--- /dev/null
+++ b/bn_mp_cmp_d.c
@@ -0,0 +1,37 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* compare a digit */
+int
+mp_cmp_d (mp_int * a, mp_digit b)
+{
+
+  if (a->sign == MP_NEG) {
+    return MP_LT;
+  }
+
+  if (a->used > 1) {
+    return MP_GT;
+  }
+
+  if (a->dp[0] > b) {
+    return MP_GT;
+  } else if (a->dp[0] < b) {
+    return MP_LT;
+  } else {
+    return MP_EQ;
+  }
+}
diff --git a/bn_mp_cmp_mag.c b/bn_mp_cmp_mag.c
new file mode 100644
index 000000000..efe5b2bef
--- /dev/null
+++ b/bn_mp_cmp_mag.c
@@ -0,0 +1,40 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* compare maginitude of two ints (unsigned) */
+int
+mp_cmp_mag (mp_int * a, mp_int * b)
+{
+  int       n;
+
+
+  /* compare based on # of non-zero digits */
+  if (a->used > b->used) {
+    return MP_GT;
+  } else if (a->used < b->used) {
+    return MP_LT;
+  }
+
+  /* compare based on digits  */
+  for (n = a->used - 1; n >= 0; n--) {
+    if (a->dp[n] > b->dp[n]) {
+      return MP_GT;
+    } else if (a->dp[n] < b->dp[n]) {
+      return MP_LT;
+    }
+  }
+  return MP_EQ;
+}
diff --git a/bn_mp_copy.c b/bn_mp_copy.c
new file mode 100644
index 000000000..a50262041
--- /dev/null
+++ b/bn_mp_copy.c
@@ -0,0 +1,48 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* copy, b = a */
+int
+mp_copy (mp_int * a, mp_int * b)
+{
+  int       res, n;
+
+
+  /* if dst == src do nothing */
+  if (a == b || a->dp == b->dp) {
+    return MP_OKAY;
+  }
+
+  /* grow dest */
+  if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero b and copy the parameters over */
+  b->used = a->used;
+  b->sign = a->sign;
+
+  /* copy all the digits */
+  for (n = 0; n < a->used; n++) {
+    b->dp[n] = a->dp[n];
+  }
+
+  /* clear high digits */
+  for (n = b->used; n < b->alloc; n++) {
+    b->dp[n] = 0;
+  }
+  return MP_OKAY;
+}
diff --git a/bn_mp_count_bits.c b/bn_mp_count_bits.c
new file mode 100644
index 000000000..3f6ed28aa
--- /dev/null
+++ b/bn_mp_count_bits.c
@@ -0,0 +1,35 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* returns the number of bits in an int */
+int
+mp_count_bits (mp_int * a)
+{
+  int       r;
+  mp_digit  q;
+
+  if (a->used == 0) {
+    return 0;
+  }
+
+  r = (a->used - 1) * DIGIT_BIT;
+  q = a->dp[a->used - 1];
+  while (q > ((mp_digit) 0)) {
+    ++r;
+    q >>= ((mp_digit) 1);
+  }
+  return r;
+}
diff --git a/bn_mp_div.c b/bn_mp_div.c
new file mode 100644
index 000000000..b324d2d2d
--- /dev/null
+++ b/bn_mp_div.c
@@ -0,0 +1,200 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly incomplete.  For example,
+ * it doesn't consider the case where digits are removed from 'x' in the inner
+ * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
+*/
+int
+mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int    q, x, y, t1, t2;
+  int       res, n, t, i, norm, neg;
+
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto __Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto __X;
+  }
+
+  /* fix the sign */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
+  norm = 0;
+  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) ==
+	 ((mp_digit) 0)) {
+    ++norm;
+    if ((res = mp_mul_2d (&x, 1, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+    if ((res = mp_mul_2d (&y, 1, &y)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) {	/* y = y*b^{n-t} */
+    goto __Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    if (i > x.alloc)
+      continue;
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL);
+    } else {
+      mp_word   tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+	tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+	goto __Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    }
+    while (mp_cmp (&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+	goto __Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+	goto __Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+	goto __Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder [which we have to normalize] */
+  /* get sign before writing to c */
+  x.sign = a->sign;
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_div_2d (&x, norm, &x, NULL);
+    mp_clamp (&x);
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+__Q:mp_clear (&q);
+  return res;
+}
diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c
new file mode 100644
index 000000000..bc7dc28b9
--- /dev/null
+++ b/bn_mp_div_2.c
@@ -0,0 +1,38 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* b = a/2 */
+int
+mp_div_2 (mp_int * a, mp_int * b)
+{
+  mp_digit  r, rr;
+  int       x, res;
+
+
+  /* copy */
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+
+  r = 0;
+  for (x = b->used - 1; x >= 0; x--) {
+    rr = b->dp[x] & 1;
+    b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT - 1));
+    r = rr;
+  }
+  mp_clamp (b);
+  return MP_OKAY;
+}
diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c
new file mode 100644
index 000000000..3bfa5aabe
--- /dev/null
+++ b/bn_mp_div_2d.c
@@ -0,0 +1,78 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* shift right by a certain bit count (store quotient in c, remainder in d) */
+int
+mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+{
+  mp_digit  D, r, rr;
+  int       x, res;
+  mp_int    t;
+
+
+  /* if the shift count is <= 0 then we do no work */
+  if (b <= 0) {
+    res = mp_copy (a, c);
+    if (d != NULL) {
+      mp_zero (d);
+    }
+    return res;
+  }
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  /* get the remainder */
+  if (d != NULL) {
+    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  /* shift by as many digits in the bit count */
+  mp_rshd (c, b / DIGIT_BIT);
+
+  /* shift any bit count < DIGIT_BIT */
+  D = (mp_digit) (b % DIGIT_BIT);
+  if (D != 0) {
+    r = 0;
+    for (x = c->used - 1; x >= 0; x--) {
+      /* get the lower  bits of this word in a temp */
+      rr = c->dp[x] & ((mp_digit) ((1U << D) - 1U));
+
+      /* shift the current word and mix in the carry bits from the previous word */
+      c->dp[x] = (c->dp[x] >> D) | (r << (DIGIT_BIT - D));
+
+      /* set the carry to the carry bits of the current word found above */
+      r = rr;
+    }
+  }
+  mp_clamp (c);
+  res = MP_OKAY;
+  if (d != NULL) {
+    mp_exch (&t, d);
+  }
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c
new file mode 100644
index 000000000..321f030dd
--- /dev/null
+++ b/bn_mp_div_d.c
@@ -0,0 +1,44 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* single digit division */
+int
+mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+{
+  mp_int    t, t2;
+  int       res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  mp_set (&t, b);
+  res = mp_div (a, &t, c, &t2);
+
+  if (d != NULL) {
+    *d = t2.dp[0];
+  }
+
+  mp_clear (&t);
+  mp_clear (&t2);
+  return res;
+}
diff --git a/bn_mp_exch.c b/bn_mp_exch.c
new file mode 100644
index 000000000..44e7087b5
--- /dev/null
+++ b/bn_mp_exch.c
@@ -0,0 +1,25 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+void
+mp_exch (mp_int * a, mp_int * b)
+{
+  mp_int    t;
+
+  t = *a;
+  *a = *b;
+  *b = t;
+}
diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c
new file mode 100644
index 000000000..80f9578a6
--- /dev/null
+++ b/bn_mp_expt_d.c
@@ -0,0 +1,49 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+int
+mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int       res, x;
+  mp_int    g;
+
+
+  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* set initial result */
+  mp_set (c, 1);
+
+  for (x = 0; x < (int) DIGIT_BIT; x++) {
+    if ((res = mp_sqr (c, c)) != MP_OKAY) {
+      mp_clear (&g);
+      return res;
+    }
+
+    if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) {
+      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
+	mp_clear (&g);
+	return res;
+      }
+    }
+
+    b <<= 1;
+  }
+
+  mp_clear (&g);
+  return MP_OKAY;
+}
diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c
new file mode 100644
index 000000000..ce81953c1
--- /dev/null
+++ b/bn_mp_exptmod.c
@@ -0,0 +1,213 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+int
+mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int    M[256], res, mu;
+  mp_digit  buf;
+  int       err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+
+  /* if the modulus is odd use the fast method */
+  if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
+    err = mp_exptmod_fast (G, X, P, Y);
+    return err;
+  }
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+  /* init G array */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+      for (y = 0; y < x; y++) {
+	mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table 
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err =
+	 mp_sqr (&M[1 << (winsize - 1)],
+		 &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode = 0;
+  bitcnt = 0;
+  buf = 0;
+  digidx = X->used - 1;
+  bitcpy = bitbuf = 0;
+
+  bitcnt = 1;
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+	break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= 1;
+
+    /* if the bit is zero and mode == 0 then we ignore it 
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it 
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+	goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+	goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply multiply */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+	  goto __RES;
+	}
+	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+	  goto __RES;
+	}
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+	goto __MU;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+	goto __MU;
+      }
+
+      /* empty window and reset */
+      bitcpy = bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+	goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+	goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+	/* then multiply */
+	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+	  goto __RES;
+	}
+	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+	  goto __RES;
+	}
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
new file mode 100644
index 000000000..bd25b3e86
--- /dev/null
+++ b/bn_mp_exptmod_fast.c
@@ -0,0 +1,229 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery reduction 
+ */
+int
+mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int    M[256], res;
+  mp_digit  buf, mp;
+  int       err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+  /* init G array */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+      for (y = 0; y < x; y++) {
+	mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  /* now setup montgomery  */
+  if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+    goto __M;
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __RES;
+  }
+
+  /* now we need R mod m */
+  if ((err = mp_2expt (&res, P->used * DIGIT_BIT)) != MP_OKAY) {
+    goto __RES;
+  }
+
+  /* res = R mod m (can use modified double/subtract ...) */
+  if ((err = mp_mod (&res, P, &res)) != MP_OKAY) {
+    goto __RES;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __RES;
+  }
+
+  /* now set M[1] to G * R mod m */
+  if ((err = mp_mulmod (&M[1], &res, P, &M[1])) != MP_OKAY) {
+    goto __RES;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err =
+	 mp_sqr (&M[1 << (winsize - 1)],
+		 &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err =
+	 mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = mp_montgomery_reduce (&M[x], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode = 0;
+  bitcnt = 0;
+  buf = 0;
+  digidx = X->used - 1;
+  bitcpy = bitbuf = 0;
+
+  bitcnt = 1;
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+	break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= 1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+	goto __RES;
+      }
+      if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+	goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply multiply */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+	  goto __RES;
+	}
+	if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+	  goto __RES;
+	}
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+	goto __RES;
+      }
+      if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+	goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+	goto __RES;
+      }
+      if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+	goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+	/* then multiply */
+	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+	  goto __RES;
+	}
+	if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+	  goto __RES;
+	}
+      }
+    }
+  }
+
+  /* fixup result */
+  if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+    goto __RES;
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
new file mode 100644
index 000000000..e2594b561
--- /dev/null
+++ b/bn_mp_gcd.c
@@ -0,0 +1,118 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
+ */
+int
+mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int    u, v, t;
+  int       k, res, neg;
+
+
+  /* either zero than gcd is the largest */
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+    return mp_copy (b, c);
+  }
+  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+    return mp_copy (a, c);
+  }
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
+    mp_set (c, 1);
+    return MP_OKAY;
+  }
+
+  /* if both are negative they share (-1) as a common divisor */
+  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
+
+  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
+    goto __U;
+  }
+
+  /* must be positive for the remainder of the algorithm */
+  u.sign = v.sign = MP_ZPOS;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    goto __V;
+  }
+
+  /* B1.  Find power of two */
+  k = 0;
+  while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
+    ++k;
+    if ((res = mp_div_2d (&u, 1, &u, NULL)) != MP_OKAY) {
+      goto __T;
+    }
+    if ((res = mp_div_2d (&v, 1, &v, NULL)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  /* B2.  Initialize */
+  if ((u.dp[0] & 1) == 1) {
+    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+    t.sign = MP_NEG;
+  } else {
+    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  do {
+    /* B3 (and B4).  Halve t, if even */
+    while (t.used != 0 && (t.dp[0] & 1) == 0) {
+      if ((res = mp_div_2d (&t, 1, &t, NULL)) != MP_OKAY) {
+	goto __T;
+      }
+    }
+
+    /* B5.  if t>0 then u=t otherwise v=-t */
+    if (t.used != 0 && t.sign != MP_NEG) {
+      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
+	goto __T;
+      }
+    } else {
+      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
+	goto __T;
+      }
+      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+    }
+
+    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
+    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+  while (t.used != 0);
+
+  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
+    goto __T;
+  }
+
+  mp_exch (&u, c);
+  c->sign = neg;
+  res = MP_OKAY;
+__T:mp_clear (&t);
+__V:mp_clear (&u);
+__U:mp_clear (&v);
+  return res;
+}
diff --git a/bn_mp_grow.c b/bn_mp_grow.c
new file mode 100644
index 000000000..2a123698a
--- /dev/null
+++ b/bn_mp_grow.c
@@ -0,0 +1,40 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* grow as required */
+int
+mp_grow (mp_int * a, int size)
+{
+  int       i, n;
+
+
+  /* if the alloc size is smaller alloc more ram */
+  if (a->alloc < size) {
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	/* ensure there are always at least MP_PREC digits extra on top */
+
+    a->dp = realloc (a->dp, sizeof (mp_digit) * size);
+    if (a->dp == NULL) {
+      return MP_MEM;
+    }
+
+    n = a->alloc;
+    a->alloc = size;
+    for (i = n; i < a->alloc; i++) {
+      a->dp[i] = 0;
+    }
+  }
+  return MP_OKAY;
+}
diff --git a/bn_mp_init.c b/bn_mp_init.c
new file mode 100644
index 000000000..024bfd9f1
--- /dev/null
+++ b/bn_mp_init.c
@@ -0,0 +1,35 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* init a new bigint */
+int
+mp_init (mp_int * a)
+{
+
+  /* allocate ram required and clear it */
+  a->dp = calloc (sizeof (mp_digit), MP_PREC);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+
+  /* set the used to zero, allocated digit to the default precision
+   * and sign to positive */
+  a->used = 0;
+  a->alloc = MP_PREC;
+  a->sign = MP_ZPOS;
+
+  return MP_OKAY;
+}
diff --git a/bn_mp_init_copy.c b/bn_mp_init_copy.c
new file mode 100644
index 000000000..b3f25ee2a
--- /dev/null
+++ b/bn_mp_init_copy.c
@@ -0,0 +1,28 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* creates "a" then copies b into it */
+int
+mp_init_copy (mp_int * a, mp_int * b)
+{
+  int       res;
+
+  if ((res = mp_init (a)) != MP_OKAY) {
+    return res;
+  }
+  res = mp_copy (b, a);
+  return res;
+}
diff --git a/bn_mp_init_size.c b/bn_mp_init_size.c
new file mode 100644
index 000000000..4dc46b0f7
--- /dev/null
+++ b/bn_mp_init_size.c
@@ -0,0 +1,33 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* init a mp_init and grow it to a given size */
+int
+mp_init_size (mp_int * a, int size)
+{
+
+  /* pad up so there are at least 16 zero digits */
+  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	/* ensure there are always at least 16 digits extra on top */
+  a->dp = calloc (sizeof (mp_digit), size);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+  a->used = 0;
+  a->alloc = size;
+  a->sign = MP_ZPOS;
+
+  return MP_OKAY;
+}
diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c
new file mode 100644
index 000000000..1051eb017
--- /dev/null
+++ b/bn_mp_invmod.c
@@ -0,0 +1,203 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+int
+mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int    x, y, u, v, A, B, C, D;
+  int       res;
+
+
+  /* b cannot be negative */
+  if (b->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* if the modulus is odd we can use a faster routine instead */
+  if (mp_iseven (b) == 0) {
+    res = fast_mp_invmod (a, b, c);
+    return res;
+  }
+
+  if ((res = mp_init (&x)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  if ((res = mp_init (&y)) != MP_OKAY) {
+    goto __X;
+  }
+
+  if ((res = mp_init (&u)) != MP_OKAY) {
+    goto __Y;
+  }
+
+  if ((res = mp_init (&v)) != MP_OKAY) {
+    goto __U;
+  }
+
+  if ((res = mp_init (&A)) != MP_OKAY) {
+    goto __V;
+  }
+
+  if ((res = mp_init (&B)) != MP_OKAY) {
+    goto __A;
+  }
+
+  if ((res = mp_init (&C)) != MP_OKAY) {
+    goto __B;
+  }
+
+  if ((res = mp_init (&D)) != MP_OKAY) {
+    goto __C;
+  }
+
+  /* x = a, y = b */
+  if ((res = mp_copy (a, &x)) != MP_OKAY) {
+    goto __D;
+  }
+  if ((res = mp_copy (b, &y)) != MP_OKAY) {
+    goto __D;
+  }
+
+  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
+    goto __D;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __D;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __D;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __D;
+  }
+  mp_set (&A, 1);
+  mp_set (&D, 1);
+
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __D;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
+      /* A = (A+y)/2, B = (B-x)/2 */
+      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
+	goto __D;
+      }
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+	goto __D;
+      }
+    }
+    /* A = A/2, B = B/2 */
+    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
+      goto __D;
+    }
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __D;
+    }
+    /* 5.2 if C,D are even then */
+    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
+      /* C = (C+y)/2, D = (D-x)/2 */
+      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
+	goto __D;
+      }
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+	goto __D;
+      }
+    }
+    /* C = C/2, D = D/2 */
+    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
+      goto __D;
+    }
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto __D;
+    }
+
+    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
+      goto __D;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto __D;
+    }
+  } else {
+    /* v - v - u, C = C - A, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto __D;
+    }
+
+    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
+      goto __D;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto __D;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __D;
+  }
+
+  /* a is now the inverse */
+  mp_exch (&C, c);
+  res = MP_OKAY;
+
+__D:mp_clear (&D);
+__C:mp_clear (&C);
+__B:mp_clear (&B);
+__A:mp_clear (&A);
+__V:mp_clear (&v);
+__U:mp_clear (&u);
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__ERR:
+  return res;
+}
diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c
new file mode 100644
index 000000000..b97d5f3db
--- /dev/null
+++ b/bn_mp_jacobi.c
@@ -0,0 +1,114 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes the jacobi c = (a | n) (or Legendre if b is prime)
+ * HAC pp. 73 Algorithm 2.149
+ */
+int
+mp_jacobi (mp_int * a, mp_int * n, int *c)
+{
+  mp_int    a1, n1, e;
+  int       s, r, res;
+  mp_digit  residue;
+
+  /* step 1.  if a == 0, return 0 */
+  if (mp_iszero (a) == 1) {
+    *c = 0;
+    return MP_OKAY;
+  }
+
+  /* step 2.  if a == 1, return 1 */
+  if (mp_cmp_d (a, 1) == MP_EQ) {
+    *c = 1;
+    return MP_OKAY;
+  }
+
+  /* default */
+  s = 0;
+
+  /* step 3.  write a = a1 * 2^e  */
+  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&n1)) != MP_OKAY) {
+    goto __A1;
+  }
+
+  if ((res = mp_init (&e)) != MP_OKAY) {
+    goto __N1;
+  }
+
+  while (mp_iseven (&a1) == 1) {
+    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
+      goto __E;
+    }
+
+    if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
+      goto __E;
+    }
+  }
+
+  /* step 4.  if e is even set s=1 */
+  if (mp_iseven (&e) == 1) {
+    s = 1;
+  } else {
+    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
+    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
+      goto __E;
+    }
+
+    if (residue == 1 || residue == 7) {
+      s = 1;
+    } else if (residue == 3 || residue == 5) {
+      s = -1;
+    }
+  }
+
+  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
+    goto __E;
+  }
+  if (residue == 3) {
+    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
+      goto __E;
+    }
+    if (residue == 3) {
+      s = -s;
+    }
+  }
+
+  /* if a1 == 1 we're done */
+  if (mp_cmp_d (&a1, 1) == MP_EQ) {
+    *c = s;
+  } else {
+    /* n1 = n mod a1 */
+    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
+      goto __E;
+    }
+    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
+      goto __E;
+    }
+    *c = s * r;
+  }
+
+  /* done */
+  res = MP_OKAY;
+__E:mp_clear (&e);
+__N1:mp_clear (&n1);
+__A1:mp_clear (&a1);
+  return res;
+}
diff --git a/bn_mp_karatsuba_mul.c b/bn_mp_karatsuba_mul.c
new file mode 100644
index 000000000..0a8c34e73
--- /dev/null
+++ b/bn_mp_karatsuba_mul.c
@@ -0,0 +1,142 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications
+ *
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b)
+ *
+ * a = a1 * B^n + a0
+ * b = b1 * B^n + b0
+ *
+ * Then, a * b => a1b1 * B^2n + ((a1 - b1)(a0 - b0) + a0b0 + a1b1) * B + a0b0
+ *
+ * Note that a1b1 and a0b0 are used twice and only need to be computed once.  So in total
+ * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-b1)(a0-b0)
+ *
+ * Note that a multiplication of half the digits requires 1/4th the number of single precision 
+ * multiplications so in total after one call 25% of the single precision multiplications are saved.
+ * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above
+ * the threshold.  This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which
+ * is asymptopically lower than the standard O(N^2) that the baseline/comba methods use.  Generally though the 
+ * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached.
+ */
+int
+mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int    x0, x1, y0, y1, t1, t2, x0y0, x1y1;
+  int       B, err, x;
+
+
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = MIN (a->used, b->used);
+
+  /* now divide in two */
+  B = B / 2;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+  if (mp_init_size (&y0, B) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+    goto Y0;
+
+  /* init temps */
+  if (mp_init (&t1) != MP_OKAY)
+    goto Y1;
+  if (mp_init (&t2) != MP_OKAY)
+    goto T1;
+  if (mp_init (&x0y0) != MP_OKAY)
+    goto T2;
+  if (mp_init (&x1y1) != MP_OKAY)
+    goto X0Y0;
+
+  /* now shift the digits */
+  x0.sign = x1.sign = a->sign;
+  y0.sign = y1.sign = b->sign;
+
+  x0.used = y0.used = B;
+  x1.used = a->used - B;
+  y1.used = b->used - B;
+
+  /* we copy the digits directly instead of using higher level functions
+   * since we also need to shift the digits
+   */
+  for (x = 0; x < B; x++) {
+    x0.dp[x] = a->dp[x];
+    y0.dp[x] = b->dp[x];
+  }
+  for (x = B; x < a->used; x++) {
+    x1.dp[x - B] = a->dp[x];
+  }
+  for (x = B; x < b->used; x++) {
+    y1.dp[x - B] = b->dp[x];
+  }
+
+  /* only need to clamp the lower words since by definition the upper words x1/y1 must
+   * have a known number of digits
+   */
+  mp_clamp (&x0);
+  mp_clamp (&y0);
+
+  /* now calc the products x0y0 and x1y1 */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
+    goto X1Y1;			/* x0y0 = x0*y0 */
+  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+    goto X1Y1;			/* x1y1 = x1*y1 */
+
+  /* now calc x1-x0 and y1-y0 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;			/* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &t2) != MP_OKAY)
+    goto X1Y1;			/* t2 = y1 - y0 */
+  if (mp_mul (&t1, &t2, &t1) != MP_OKAY)
+    goto X1Y1;			/* t1 = (x1 - x0) * (y1 - y0) */
+
+  /* add x0y0 */
+  if (mp_add (&x0y0, &x1y1, &t2) != MP_OKAY)
+    goto X1Y1;			/* t2 = x0y0 + x1y1 */
+  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+    goto X1Y1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1Y1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+    goto X1Y1;			/* x1y1 = x1y1 << 2*B */
+
+  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;			/* t1 = x0y0 + t1 */
+  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+    goto X1Y1;			/* t1 = x0y0 + t1 + x1y1 */
+
+  err = MP_OKAY;
+
+X1Y1:mp_clear (&x1y1);
+X0Y0:mp_clear (&x0y0);
+T2:mp_clear (&t2);
+T1:mp_clear (&t1);
+Y1:mp_clear (&y1);
+Y0:mp_clear (&y0);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
diff --git a/bn_mp_karatsuba_sqr.c b/bn_mp_karatsuba_sqr.c
new file mode 100644
index 000000000..29396baed
--- /dev/null
+++ b/bn_mp_karatsuba_sqr.c
@@ -0,0 +1,106 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* Karatsuba squaring, computes b = a*a using three half size squarings
+ *
+ * See comments of mp_karatsuba_mul for details.  It is essentially the same algorithm
+ * but merely tuned to perform recursive squarings.
+ */
+int
+mp_karatsuba_sqr (mp_int * a, mp_int * b)
+{
+  mp_int    x0, x1, t1, t2, x0x0, x1x1;
+  int       B, err, x;
+
+
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = a->used;
+
+  /* now divide in two */
+  B = B / 2;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+
+  /* init temps */
+  if (mp_init (&t1) != MP_OKAY)
+    goto X1;
+  if (mp_init (&t2) != MP_OKAY)
+    goto T1;
+  if (mp_init (&x0x0) != MP_OKAY)
+    goto T2;
+  if (mp_init (&x1x1) != MP_OKAY)
+    goto X0X0;
+
+  /* now shift the digits */
+  for (x = 0; x < B; x++) {
+    x0.dp[x] = a->dp[x];
+  }
+
+  for (x = B; x < a->used; x++) {
+    x1.dp[x - B] = a->dp[x];
+  }
+
+  x0.used = B;
+  x1.used = a->used - B;
+
+  mp_clamp (&x0);
+
+  /* now calc the products x0*x0 and x1*x1 */
+  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+    goto X1X1;			/* x0x0 = x0*x0 */
+  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+    goto X1X1;			/* x1x1 = x1*x1 */
+
+  /* now calc x1-x0 and y1-y0 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1X1;			/* t1 = x1 - x0 */
+  if (mp_sqr (&t1, &t1) != MP_OKAY)
+    goto X1X1;			/* t1 = (x1 - x0) * (y1 - y0) */
+
+  /* add x0y0 */
+  if (mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+    goto X1X1;			/* t2 = x0y0 + x1y1 */
+  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+    goto X1X1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1X1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+    goto X1X1;			/* x1y1 = x1y1 << 2*B */
+
+  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+    goto X1X1;			/* t1 = x0y0 + t1 */
+  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+    goto X1X1;			/* t1 = x0y0 + t1 + x1y1 */
+
+  err = MP_OKAY;
+
+X1X1:mp_clear (&x1x1);
+X0X0:mp_clear (&x0x0);
+T2:mp_clear (&t2);
+T1:mp_clear (&t1);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
diff --git a/bn_mp_lcm.c b/bn_mp_lcm.c
new file mode 100644
index 000000000..7d3813541
--- /dev/null
+++ b/bn_mp_lcm.c
@@ -0,0 +1,42 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes least common multiple as a*b/(a, b) */
+int
+mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       res;
+  mp_int    t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  res = mp_div (&t, c, c, NULL);
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_lshd.c b/bn_mp_lshd.c
new file mode 100644
index 000000000..ea0240900
--- /dev/null
+++ b/bn_mp_lshd.c
@@ -0,0 +1,46 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* shift left a certain amount of digits */
+int
+mp_lshd (mp_int * a, int b)
+{
+  int       x, res;
+
+
+  /* if its less than zero return */
+  if (b <= 0) {
+    return MP_OKAY;
+  }
+
+  /* grow to fit the new digits */
+  if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+    return res;
+  }
+
+  /* increment the used by the shift amount than copy upwards */
+  a->used += b;
+  for (x = a->used - 1; x >= b; x--) {
+    a->dp[x] = a->dp[x - b];
+  }
+
+  /* zero the lower digits */
+  for (x = 0; x < b; x++) {
+    a->dp[x] = 0;
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
diff --git a/bn_mp_mod.c b/bn_mp_mod.c
new file mode 100644
index 000000000..5b208a6ee
--- /dev/null
+++ b/bn_mp_mod.c
@@ -0,0 +1,43 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* c = a mod b, 0 <= c < b */
+int
+mp_mod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int    t;
+  int       res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  if (t.sign == MP_NEG) {
+    res = mp_add (b, &t, c);
+  } else {
+    res = MP_OKAY;
+    mp_exch (&t, c);
+  }
+
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_mod_2d.c b/bn_mp_mod_2d.c
new file mode 100644
index 000000000..df7361270
--- /dev/null
+++ b/bn_mp_mod_2d.c
@@ -0,0 +1,51 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* calc a value mod 2^b */
+int
+mp_mod_2d (mp_int * a, int b, mp_int * c)
+{
+  int       x, res;
+
+
+  /* if b is <= 0 then zero the int */
+  if (b <= 0) {
+    mp_zero (c);
+    return MP_OKAY;
+  }
+
+  /* if the modulus is larger than the value than return */
+  if (b > (int) (a->used * DIGIT_BIT)) {
+    res = mp_copy (a, c);
+    return res;
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero digits above the last digit of the modulus */
+  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+    c->dp[x] = 0;
+  }
+  /* clear the digit that is not completely outside/inside the modulus */
+  c->dp[b / DIGIT_BIT] &=
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) -
+		((mp_digit) 1));
+  mp_clamp (c);
+  return MP_OKAY;
+}
diff --git a/bn_mp_mod_d.c b/bn_mp_mod_d.c
new file mode 100644
index 000000000..7b08d2349
--- /dev/null
+++ b/bn_mp_mod_d.c
@@ -0,0 +1,47 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+int
+mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
+{
+  mp_int    t, t2;
+  int       res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  mp_set (&t, b);
+  mp_div (a, &t, NULL, &t2);
+
+  if (t2.sign == MP_NEG) {
+    if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) {
+      mp_clear (&t);
+      mp_clear (&t2);
+      return res;
+    }
+  }
+  *c = t2.dp[0];
+  mp_clear (&t);
+  mp_clear (&t2);
+  return MP_OKAY;
+}
diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c
new file mode 100644
index 000000000..aeb2cdece
--- /dev/null
+++ b/bn_mp_montgomery_reduce.c
@@ -0,0 +1,80 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+int
+mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+{
+  int       ix, res, digs;
+  mp_digit  ui;
+
+  digs = m->used * 2 + 1;
+  if ((digs < 512)
+      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    res = fast_mp_montgomery_reduce (a, m, mp);
+    return res;
+  }
+
+  if (a->alloc < m->used * 2 + 1) {
+    if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+  a->used = m->used * 2 + 1;
+
+  for (ix = 0; ix < m->used; ix++) {
+    /* ui = ai * m' mod b */
+    ui = (a->dp[ix] * mp) & MP_MASK;
+
+    /* a = a + ui * m * b^i */
+    {
+      register int iy;
+      register mp_digit *tmpx, *tmpy, mu;
+      register mp_word r;
+
+      /* aliases */
+      tmpx = m->dp;
+      tmpy = a->dp + ix;
+
+      mu = 0;
+      for (iy = 0; iy < m->used; iy++) {
+	r =
+	  ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) +
+	  ((mp_word) * tmpy);
+	mu = (r >> ((mp_word) DIGIT_BIT));
+	*tmpy++ = (r & ((mp_word) MP_MASK));
+      }
+      /* propagate carries */
+      while (mu) {
+	*tmpy += mu;
+	mu = (*tmpy >> DIGIT_BIT) & 1;
+	*tmpy++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* A = A/b^n */
+  mp_rshd (a, m->used);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (a, m) != MP_LT) {
+    if ((res = s_mp_sub (a, m, a)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  return MP_OKAY;
+}
diff --git a/bn_mp_montgomery_setup.c b/bn_mp_montgomery_setup.c
new file mode 100644
index 000000000..601e74cff
--- /dev/null
+++ b/bn_mp_montgomery_setup.c
@@ -0,0 +1,53 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* setups the montgomery reduction stuff */
+int
+mp_montgomery_setup (mp_int * a, mp_digit * mp)
+{
+  mp_int    t, tt;
+  int       res;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&tt)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* tt = b */
+  tt.dp[0] = 0;
+  tt.dp[1] = 1;
+  tt.used = 2;
+
+  /* t = m mod b */
+  t.dp[0] = a->dp[0];
+  t.used = 1;
+
+  /* t = 1/m mod b */
+  if ((res = mp_invmod (&t, &tt, &t)) != MP_OKAY) {
+    goto __TT;
+  }
+
+  /* t = -1/m mod b */
+  *mp = ((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - t.dp[0];
+
+  res = MP_OKAY;
+__TT:mp_clear (&tt);
+__T:mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_mul.c b/bn_mp_mul.c
new file mode 100644
index 000000000..cfa1467cb
--- /dev/null
+++ b/bn_mp_mul.c
@@ -0,0 +1,30 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* high level multiplication (handles sign) */
+int
+mp_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       res, neg;
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
+    res = mp_karatsuba_mul (a, b, c);
+  } else {
+    res = s_mp_mul (a, b, c);
+  }
+  c->sign = neg;
+  return res;
+}
diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c
new file mode 100644
index 000000000..dd5ecca4d
--- /dev/null
+++ b/bn_mp_mul_2.c
@@ -0,0 +1,44 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* b = a*2 */
+int
+mp_mul_2 (mp_int * a, mp_int * b)
+{
+  mp_digit  r, rr;
+  int       x, res;
+
+
+  /* copy */
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
+    return res;
+  }
+  ++b->used;
+
+  /* shift any bit count < DIGIT_BIT */
+  r = 0;
+  for (x = 0; x < b->used; x++) {
+    rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1;
+    b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK;
+    r = rr;
+  }
+  mp_clamp (b);
+  return MP_OKAY;
+}
diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c
new file mode 100644
index 000000000..7823eb99b
--- /dev/null
+++ b/bn_mp_mul_2d.c
@@ -0,0 +1,57 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* shift left by a certain bit count */
+int
+mp_mul_2d (mp_int * a, int b, mp_int * c)
+{
+  mp_digit  d, r, rr;
+  int       x, res;
+
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* shift by as many digits in the bit count */
+  if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
+    return res;
+  }
+  c->used = c->alloc;
+
+  /* shift any bit count < DIGIT_BIT */
+  d = (mp_digit) (b % DIGIT_BIT);
+  if (d != 0) {
+    r = 0;
+    for (x = 0; x < c->used; x++) {
+      /* get the higher bits of the current word */
+      rr = (c->dp[x] >> (DIGIT_BIT - d)) & ((mp_digit) ((1U << d) - 1U));
+
+      /* shift the current word and OR in the carry */
+      c->dp[x] = ((c->dp[x] << d) | r) & MP_MASK;
+
+      /* set the carry to the carry bits of the current word */
+      r = rr;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c
new file mode 100644
index 000000000..bced9b7b7
--- /dev/null
+++ b/bn_mp_mul_d.c
@@ -0,0 +1,46 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* multiply by a digit */
+int
+mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int       res, pa, ix;
+  mp_word   r;
+  mp_digit  u;
+  mp_int    t;
+
+
+  pa = a->used;
+  if ((res = mp_init_size (&t, pa + 2)) != MP_OKAY) {
+    return res;
+  }
+  t.used = pa + 2;
+
+  u = 0;
+  for (ix = 0; ix < pa; ix++) {
+    r = ((mp_word) u) + ((mp_word) a->dp[ix]) * ((mp_word) b);
+    t.dp[ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+    u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+  }
+  t.dp[ix] = u;
+
+  t.sign = a->sign;
+  mp_clamp (&t);
+  mp_exch (&t, c);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_mp_mulmod.c b/bn_mp_mulmod.c
new file mode 100644
index 000000000..2cdbdda9a
--- /dev/null
+++ b/bn_mp_mulmod.c
@@ -0,0 +1,36 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* d = a * b (mod c) */
+int
+mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int       res;
+  mp_int    t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_n_root.c b/bn_mp_n_root.c
new file mode 100644
index 000000000..72b3a8cd9
--- /dev/null
+++ b/bn_mp_n_root.c
@@ -0,0 +1,115 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* find the n'th root of an integer 
+ *
+ * Result found such that (c)^b <= a and (c+1)^b > a 
+ */
+int
+mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int    t1, t2, t3;
+  int       res, neg;
+
+  /* input must be positive if b is even */
+  if ((b & 1) == 0 && a->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init (&t3)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  /* if a is negative fudge the sign but keep track */
+  neg = a->sign;
+  a->sign = MP_ZPOS;
+
+  /* t2 = 2 */
+  mp_set (&t2, 2);
+
+  do {
+    /* t1 = t2 */
+    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
+      goto __T3;
+    }
+
+    /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
+    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {	/* t3 = t1^(b-1) */
+      goto __T3;
+    }
+
+    /* numerator */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {	/* t2 = t1^b */
+      goto __T3;
+    }
+
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {	/* t2 = t1^b - a */
+      goto __T3;
+    }
+
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {	/* t3 = t1^(b-1) * b  */
+      goto __T3;
+    }
+
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {	/* t3 = (t1^b - a)/(b * t1^(b-1)) */
+      goto __T3;
+    }
+
+    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
+      goto __T3;
+    }
+  }
+  while (mp_cmp (&t1, &t2) != MP_EQ);
+
+  /* result can be off by a few so check */
+  for (;;) {
+    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
+      goto __T3;
+    }
+
+    if (mp_cmp (&t2, a) == MP_GT) {
+      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
+	goto __T3;
+      }
+    } else {
+      break;
+    }
+  }
+
+  /* reset the sign of a first */
+  a->sign = neg;
+
+  /* set the result */
+  mp_exch (&t1, c);
+
+  /* set the sign of the result */
+  c->sign = neg;
+
+  res = MP_OKAY;
+
+__T3:mp_clear (&t3);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+  return res;
+}
diff --git a/bn_mp_neg.c b/bn_mp_neg.c
new file mode 100644
index 000000000..a2bb7c4bb
--- /dev/null
+++ b/bn_mp_neg.c
@@ -0,0 +1,27 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* b = -a */
+int
+mp_neg (mp_int * a, mp_int * b)
+{
+  int       res;
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+  b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  return MP_OKAY;
+}
diff --git a/bn_mp_or.c b/bn_mp_or.c
new file mode 100644
index 000000000..a3843d920
--- /dev/null
+++ b/bn_mp_or.c
@@ -0,0 +1,45 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* OR two ints together */
+int
+mp_or (mp_int * a, mp_int * b, mp_int * c)
+{
+  int res, ix, px;
+  mp_int t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] |= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_mp_rand.c b/bn_mp_rand.c
new file mode 100644
index 000000000..c72dec970
--- /dev/null
+++ b/bn_mp_rand.c
@@ -0,0 +1,48 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* makes a pseudo-random int of a given size */
+int
+mp_rand (mp_int * a, int digits)
+{
+  int       res;
+  mp_digit  d;
+
+  mp_zero (a);
+  if (digits <= 0) {
+    return MP_OKAY;
+  }
+
+  /* first place a random non-zero digit */
+  d = ((mp_digit) abs (rand ()));
+  d = d == 0 ? 1 : d;
+
+  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
+    return res;
+  }
+
+
+  while (digits-- > 0) {
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
+      return res;
+    }
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  return MP_OKAY;
+}
diff --git a/bn_mp_read_signed_bin.c b/bn_mp_read_signed_bin.c
new file mode 100644
index 000000000..b5af4c0f0
--- /dev/null
+++ b/bn_mp_read_signed_bin.c
@@ -0,0 +1,28 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* read signed bin, big endian, first byte is 0==positive or 1==negative */
+int
+mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+{
+  int       res;
+
+  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
+    return res;
+  }
+  a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG);
+  return MP_OKAY;
+}
diff --git a/bn_mp_read_unsigned_bin.c b/bn_mp_read_unsigned_bin.c
new file mode 100644
index 000000000..726b574d2
--- /dev/null
+++ b/bn_mp_read_unsigned_bin.c
@@ -0,0 +1,39 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* reads a unsigned char array, assumes the msb is stored first [big endian] */
+int
+mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+{
+  int       res;
+  mp_zero (a);
+  while (c-- > 0) {
+    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
+      return res;
+    }
+
+    if (DIGIT_BIT != 7) {
+      a->dp[0] |= *b++;
+      a->used += 1;
+    } else {
+      a->dp[0] = (*b & MP_MASK);
+      a->dp[1] |= ((*b++ >> 7U) & 1);
+      a->used += 2;
+    }
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c
new file mode 100644
index 000000000..be5d18ef3
--- /dev/null
+++ b/bn_mp_reduce.c
@@ -0,0 +1,95 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* pre-calculate the value required for Barrett reduction
+ * For a given modulus "b" it calulates the value required in "a"
+ */
+int
+mp_reduce_setup (mp_int * a, mp_int * b)
+{
+  int       res;
+
+
+  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
+    return res;
+  }
+  res = mp_div (a, b, a, NULL);
+  return res;
+}
+
+/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup 
+ * From HAC pp.604 Algorithm 14.42 
+ */
+int
+mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+{
+  mp_int    q;
+  int       res, um = m->used;
+
+
+  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
+    return res;
+  }
+
+  mp_rshd (&q, um - 1);		/* q1 = x / b^(k-1)  */
+
+  /* according to HAC this is optimization is ok */
+  if (((unsigned long) m->used) > (1UL << (unsigned long) (DIGIT_BIT - 1UL))) {
+    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  } else {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  }
+
+  mp_rshd (&q, um + 1);		/* q3 = q2 / b^(k+1) */
+
+  /* x = x mod b^(k+1), quick (no division) */
+  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* q = q * m mod b^(k+1), quick (no division) */
+  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* x = x - q */
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY)
+    goto CLEANUP;
+
+  /* If x < 0, add b^(k+1) to it */
+  if (mp_cmp_d (x, 0) == MP_LT) {
+    mp_set (&q, 1);
+    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+      goto CLEANUP;
+    if ((res = mp_add (x, &q, x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  /* Back off if it's too big */
+  while (mp_cmp (x, m) != MP_LT) {
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY)
+      break;
+  }
+
+CLEANUP:
+  mp_clear (&q);
+
+  return res;
+}
diff --git a/bn_mp_rshd.c b/bn_mp_rshd.c
new file mode 100644
index 000000000..8b37d09d5
--- /dev/null
+++ b/bn_mp_rshd.c
@@ -0,0 +1,45 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* shift right a certain amount of digits */
+void
+mp_rshd (mp_int * a, int b)
+{
+  int       x;
+
+
+  /* if b <= 0 then ignore it */
+  if (b <= 0) {
+    return;
+  }
+
+  /* if b > used then simply zero it and return */
+  if (a->used < b) {
+    mp_zero (a);
+    return;
+  }
+
+  /* shift the digits down */
+  for (x = 0; x < (a->used - b); x++) {
+    a->dp[x] = a->dp[x + b];
+  }
+
+  /* zero the top digits */
+  for (; x < a->used; x++) {
+    a->dp[x] = 0;
+  }
+  mp_clamp (a);
+}
diff --git a/bn_mp_set.c b/bn_mp_set.c
new file mode 100644
index 000000000..aeaf9cd92
--- /dev/null
+++ b/bn_mp_set.c
@@ -0,0 +1,24 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* set to a digit */
+void
+mp_set (mp_int * a, mp_digit b)
+{
+  mp_zero (a);
+  a->dp[0] = b & MP_MASK;
+  a->used = (a->dp[0] != 0) ? 1 : 0;
+}
diff --git a/bn_mp_set_int.c b/bn_mp_set_int.c
new file mode 100644
index 000000000..a690bb415
--- /dev/null
+++ b/bn_mp_set_int.c
@@ -0,0 +1,45 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* set a 32-bit const */
+int
+mp_set_int (mp_int * a, unsigned long b)
+{
+  int       x, res;
+
+  mp_zero (a);
+
+  /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
+  for (x = 0; x < 8; x++) {
+
+    /* shift the number up four bits */
+    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
+      return res;
+    }
+
+    /* OR in the top four bits of the source */
+    a->dp[0] |= (b >> 28) & 15;
+
+    /* shift the source up to the next four bits */
+    b <<= 4;
+
+    /* ensure that digits are not clamped off */
+    a->used += 32 / DIGIT_BIT + 1;
+  }
+
+  mp_clamp (a);
+  return MP_OKAY;
+}
diff --git a/bn_mp_shrink.c b/bn_mp_shrink.c
new file mode 100644
index 000000000..3cc506b73
--- /dev/null
+++ b/bn_mp_shrink.c
@@ -0,0 +1,28 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* shrink a bignum */
+int
+mp_shrink (mp_int * a)
+{
+  if (a->alloc != a->used) {
+    if ((a->dp = realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+      return MP_MEM;
+    }
+    a->alloc = a->used;
+  }
+  return MP_OKAY;
+}
diff --git a/bn_mp_signed_bin_size.c b/bn_mp_signed_bin_size.c
new file mode 100644
index 000000000..d50ae7a94
--- /dev/null
+++ b/bn_mp_signed_bin_size.c
@@ -0,0 +1,22 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* get the size for an signed equivalent */
+int
+mp_signed_bin_size (mp_int * a)
+{
+  return 1 + mp_unsigned_bin_size (a);
+}
diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c
new file mode 100644
index 000000000..2ba877c66
--- /dev/null
+++ b/bn_mp_sqr.c
@@ -0,0 +1,29 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* computes b = a*a */
+int
+mp_sqr (mp_int * a, mp_int * b)
+{
+  int       res;
+  if (a->used > KARATSUBA_SQR_CUTOFF) {
+    res = mp_karatsuba_sqr (a, b);
+  } else {
+    res = s_mp_sqr (a, b);
+  }
+  b->sign = MP_ZPOS;
+  return res;
+}
diff --git a/bn_mp_sqrmod.c b/bn_mp_sqrmod.c
new file mode 100644
index 000000000..66135d46f
--- /dev/null
+++ b/bn_mp_sqrmod.c
@@ -0,0 +1,36 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* c = a * a (mod b) */
+int
+mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       res;
+  mp_int    t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, b, c);
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_sub.c b/bn_mp_sub.c
new file mode 100644
index 000000000..045dee5fb
--- /dev/null
+++ b/bn_mp_sub.c
@@ -0,0 +1,58 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* high level subtraction (handles signs) */
+int
+mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       sa, sb, res;
+
+
+  sa = a->sign;
+  sb = b->sign;
+
+  /* handle four cases */
+  if (sa == MP_ZPOS && sb == MP_ZPOS) {
+    /* both positive, a - b, but if b>a then we do -(b - a) */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      /* b>a */
+      res = s_mp_sub (b, a, c);
+      c->sign = MP_NEG;
+    } else {
+      res = s_mp_sub (a, b, c);
+      c->sign = MP_ZPOS;
+    }
+  } else if (sa == MP_ZPOS && sb == MP_NEG) {
+    /* a - -b == a + b  */
+    res = s_mp_add (a, b, c);
+    c->sign = MP_ZPOS;
+  } else if (sa == MP_NEG && sb == MP_ZPOS) {
+    /* -a - b == -(a + b) */
+    res = s_mp_add (a, b, c);
+    c->sign = MP_NEG;
+  } else {
+    /* -a - -b == b - a, but if a>b == -(a - b) */
+    if (mp_cmp_mag (a, b) == MP_GT) {
+      res = s_mp_sub (a, b, c);
+      c->sign = MP_NEG;
+    } else {
+      res = s_mp_sub (b, a, c);
+      c->sign = MP_ZPOS;
+    }
+  }
+
+  return res;
+}
diff --git a/bn_mp_sub_d.c b/bn_mp_sub_d.c
new file mode 100644
index 000000000..9839d5eac
--- /dev/null
+++ b/bn_mp_sub_d.c
@@ -0,0 +1,33 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* single digit subtraction */
+int
+mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int    t;
+  int       res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+  mp_set (&t, b);
+  res = mp_sub (a, &t, c);
+
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_submod.c b/bn_mp_submod.c
new file mode 100644
index 000000000..b56d92161
--- /dev/null
+++ b/bn_mp_submod.c
@@ -0,0 +1,36 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* d = a - b (mod c) */
+int
+mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int       res;
+  mp_int    t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
diff --git a/bn_mp_to_signed_bin.c b/bn_mp_to_signed_bin.c
new file mode 100644
index 000000000..b00cf8f8b
--- /dev/null
+++ b/bn_mp_to_signed_bin.c
@@ -0,0 +1,28 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* store in signed [big endian] format */
+int
+mp_to_signed_bin (mp_int * a, unsigned char *b)
+{
+  int       res;
+
+  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
+    return res;
+  }
+  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
+  return MP_OKAY;
+}
diff --git a/bn_mp_to_unsigned_bin.c b/bn_mp_to_unsigned_bin.c
new file mode 100644
index 000000000..b122555b8
--- /dev/null
+++ b/bn_mp_to_unsigned_bin.c
@@ -0,0 +1,43 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* store in unsigned [big endian] format */
+int
+mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+{
+  int       x, res;
+  mp_int    t;
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  x = 0;
+  while (mp_iszero (&t) == 0) {
+    if (DIGIT_BIT != 7) {
+      b[x++] = (unsigned char) (t.dp[0] & 255);
+    } else {
+      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
+    }
+    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+  bn_reverse (b, x);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_mp_unsigned_bin_size.c b/bn_mp_unsigned_bin_size.c
new file mode 100644
index 000000000..1c92b5c3a
--- /dev/null
+++ b/bn_mp_unsigned_bin_size.c
@@ -0,0 +1,23 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* get the size for an unsigned equivalent */
+int
+mp_unsigned_bin_size (mp_int * a)
+{
+  int       size = mp_count_bits (a);
+  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
+}
diff --git a/bn_mp_xor.c b/bn_mp_xor.c
new file mode 100644
index 000000000..c8e9f43e1
--- /dev/null
+++ b/bn_mp_xor.c
@@ -0,0 +1,45 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* XOR two ints together */
+int
+mp_xor (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       res, ix, px;
+  mp_int    t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] ^= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_mp_zero.c b/bn_mp_zero.c
new file mode 100644
index 000000000..27ca5bd2f
--- /dev/null
+++ b/bn_mp_zero.c
@@ -0,0 +1,24 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* set to zero */
+void
+mp_zero (mp_int * a)
+{
+  a->sign = MP_ZPOS;
+  a->used = 0;
+  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+}
diff --git a/bn_radix.c b/bn_radix.c
new file mode 100644
index 000000000..1fc4b3519
--- /dev/null
+++ b/bn_radix.c
@@ -0,0 +1,139 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* chars used in radix conversions */
+static const char *s_rmap =
+  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+
+/* read a string [ASCII] in a given radix */
+int
+mp_read_radix (mp_int * a, char *str, int radix)
+{
+  int       y, res, neg;
+  char      ch;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  mp_zero (a);
+  while (*str) {
+    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == s_rmap[y]) {
+	break;
+      }
+    }
+
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+	return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+	return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+  a->sign = neg;
+  return MP_OKAY;
+}
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int
+mp_toradix (mp_int * a, char *str, int radix)
+{
+  int       res, digs;
+  mp_int    t;
+  mp_digit  d;
+  char     *_s = str;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = s_rmap[d];
+    ++digs;
+  }
+  bn_reverse ((unsigned char *) _s, digs);
+  *str++ = '\0';
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* returns size of ASCII reprensentation */
+int
+mp_radix_size (mp_int * a, int radix)
+{
+  int       res, digs;
+  mp_int    t;
+  mp_digit  d;
+
+  /* special case for binary */
+  if (radix == 2) {
+    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+  }
+
+  if (radix < 2 || radix > 64) {
+    return 0;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return 0;
+  }
+
+  digs = 0;
+  if (t.sign == MP_NEG) {
+    ++digs;
+    t.sign = MP_ZPOS;
+  }
+
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return 0;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+  return digs + 1;
+}
diff --git a/bn_reverse.c b/bn_reverse.c
new file mode 100644
index 000000000..10c2375ef
--- /dev/null
+++ b/bn_reverse.c
@@ -0,0 +1,33 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* reverse an array, used for radix code */
+void
+bn_reverse (unsigned char *s, int len)
+{
+  int       ix, iy;
+  unsigned char t;
+
+  ix = 0;
+  iy = len - 1;
+  while (ix < iy) {
+    t = s[ix];
+    s[ix] = s[iy];
+    s[iy] = t;
+    ++ix;
+    --iy;
+  }
+}
diff --git a/bn_s_mp_add.c b/bn_s_mp_add.c
new file mode 100644
index 000000000..369a0e132
--- /dev/null
+++ b/bn_s_mp_add.c
@@ -0,0 +1,91 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* low level addition, based on HAC pp.594, Algorithm 14.7 */
+int
+s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int   *x;
+  int       olduse, res, min, max, i;
+  mp_digit  u;
+
+
+  /* find sizes, we let |a| <= |b| which means we have to sort
+   * them.  "x" will point to the input with the most digits
+   */
+  if (a->used > b->used) {
+    min = b->used;
+    max = a->used;
+    x = a;
+  } else if (a->used < b->used) {
+    min = a->used;
+    max = b->used;
+    x = b;
+  } else {
+    min = max = a->used;
+    x = NULL;
+  }
+
+  /* init result */
+  if (c->alloc < max + 1) {
+    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  olduse = c->used;
+  c->used = max + 1;
+
+  /* add digits from lower part */
+
+  /* set the carry to zero */
+  u = 0;
+  for (i = 0; i < min; i++) {
+    /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+    c->dp[i] = a->dp[i] + b->dp[i] + u;
+
+    /* U = carry bit of T[i] */
+    u = (c->dp[i] >> DIGIT_BIT) & 1;
+
+    /* take away carry bit from T[i] */
+    c->dp[i] &= MP_MASK;
+  }
+
+  /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
+  if (min != max) {
+    for (; i < max; i++) {
+      /* T[i] = X[i] + U */
+      c->dp[i] = x->dp[i] + u;
+
+      /* U = carry bit of T[i] */
+      u = (c->dp[i] >> DIGIT_BIT) & 1;
+
+      /* take away carry bit from T[i] */
+      c->dp[i] &= MP_MASK;
+    }
+  }
+
+  /* add carry */
+  c->dp[i] = u;
+
+  /* clear digits above used (since we may not have grown result above) */
+  for (i = c->used; i < olduse; i++) {
+    c->dp[i] = 0;
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c
new file mode 100644
index 000000000..55522a1d0
--- /dev/null
+++ b/bn_s_mp_mul_digs.c
@@ -0,0 +1,83 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* multiplies |a| * |b| and only computes upto digs digits of result
+ * HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits of 
+ * output are created.  
+ */
+int
+s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int    t;
+  int       res, pa, pb, ix, iy;
+  mp_digit  u;
+  mp_word   r;
+  mp_digit  tmpx, *tmpt, *tmpy;
+
+
+  /* can we use the fast multiplier? 
+   *
+   * The fast multiplier can be used if the output will have less than 
+   * 512 digits and the number of digits won't affect carry propagation
+   */
+  if ((digs < 512)
+      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    res = fast_s_mp_mul_digs (a, b, c, digs);
+    return res;
+  }
+
+  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
+    return res;
+  }
+  t.used = digs;
+
+  /* compute the digits of the product directly */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* set the carry to zero */
+    u = 0;
+
+    /* limit ourselves to making digs digits of output */
+    pb = MIN (b->used, digs - ix);
+
+    /* setup some aliases */
+    tmpx = a->dp[ix];
+    tmpt = &(t.dp[ix]);
+    tmpy = b->dp;
+
+    /* compute the columns of the output and propagate the carry */
+    for (iy = 0; iy < pb; iy++) {
+      /* compute the column as a mp_word */
+      r =
+	((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
+	((mp_word) u);
+
+      /* the new column is the lower part of the result */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get the carry word from the result */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    if (ix + iy < digs)
+      *tmpt = u;
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, c);
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c
new file mode 100644
index 000000000..ff2530f8f
--- /dev/null
+++ b/bn_s_mp_mul_high_digs.c
@@ -0,0 +1,77 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* multiplies |a| * |b| and does not compute the lower digs digits 
+ * [meant to get the higher part of the product]
+ */
+int
+s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int    t;
+  int       res, pa, pb, ix, iy;
+  mp_digit  u;
+  mp_word   r;
+  mp_digit  tmpx, *tmpt, *tmpy;
+
+
+  /* can we use the fast multiplier? */
+  if (((a->used + b->used + 1) < 512)
+      && MAX (a->used,
+	      b->used) <
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    res = fast_s_mp_mul_high_digs (a, b, c, digs);
+    return res;
+  }
+
+  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = a->used + b->used + 1;
+
+  pa = a->used;
+  pb = b->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* clear the carry */
+    u = 0;
+
+    /* left hand side of A[ix] * B[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias to the address of where the digits will be stored */
+    tmpt = &(t.dp[digs]);
+
+    /* alias for where to read the right hand side from */
+    tmpy = b->dp + (digs - ix);
+
+    for (iy = digs - ix; iy < pb; iy++) {
+      /* calculate the double precision result */
+      r =
+	((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
+	((mp_word) u);
+
+      /* get the lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* carry the carry */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpt = u;
+  }
+  mp_clamp (&t);
+  mp_exch (&t, c);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_s_mp_sqr.c b/bn_s_mp_sqr.c
new file mode 100644
index 000000000..94449a348
--- /dev/null
+++ b/bn_s_mp_sqr.c
@@ -0,0 +1,89 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
+int
+s_mp_sqr (mp_int * a, mp_int * b)
+{
+  mp_int    t;
+  int       res, ix, iy, pa;
+  mp_word   r, u;
+  mp_digit  tmpx, *tmpt;
+
+  /* can we use the fast multiplier? */
+  if (((a->used * 2 + 1) < 512)
+      && a->used <
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
+    res = fast_s_mp_sqr (a, b);
+    return res;
+  }
+
+  pa = a->used;
+  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = pa + pa + 1;
+
+  for (ix = 0; ix < pa; ix++) {
+    /* first calculate the digit at 2*ix */
+    /* calculate double precision result */
+    r =
+      ((mp_word) t.dp[ix + ix]) +
+      ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+
+    /* store lower part in result */
+    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* get the carry */
+    u = (r >> ((mp_word) DIGIT_BIT));
+
+    /* left hand side of A[ix] * A[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias for where to store the results */
+    tmpt = &(t.dp[ix + ix + 1]);
+    for (iy = ix + 1; iy < pa; iy++) {
+      /* first calculate the product */
+      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
+
+      /* now calculate the double precision result, note we use
+       * addition instead of *2 since its easier to optimize
+       */
+      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
+
+      /* store lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get carry */
+      u = (r >> ((mp_word) DIGIT_BIT));
+    }
+    r = ((mp_word) * tmpt) + u;
+    *tmpt = (mp_digit) (r & ((mp_word) MP_MASK));
+    u = (r >> ((mp_word) DIGIT_BIT));
+    /* propagate upwards */
+    ++tmpt;
+    while (u != ((mp_word) 0)) {
+      r = ((mp_word) * tmpt) + ((mp_word) 1);
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u = (r >> ((mp_word) DIGIT_BIT));
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, b);
+  mp_clear (&t);
+  return MP_OKAY;
+}
diff --git a/bn_s_mp_sub.c b/bn_s_mp_sub.c
new file mode 100644
index 000000000..f6da162f3
--- /dev/null
+++ b/bn_s_mp_sub.c
@@ -0,0 +1,74 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */
+int
+s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int       olduse, res, min, max, i;
+  mp_digit  u;
+
+
+  /* find sizes */
+  min = b->used;
+  max = a->used;
+
+  /* init result */
+  if (c->alloc < max) {
+    if ((res = mp_grow (c, max)) != MP_OKAY) {
+      return res;
+    }
+  }
+  olduse = c->used;
+  c->used = max;
+
+  /* sub digits from lower part */
+
+  /* set carry to zero */
+  u = 0;
+  for (i = 0; i < min; i++) {
+    /* T[i] = A[i] - B[i] - U */
+    c->dp[i] = a->dp[i] - (b->dp[i] + u);
+
+    /* U = carry bit of T[i] */
+    u = (c->dp[i] >> DIGIT_BIT) & 1;
+
+    /* Clear carry from T[i] */
+    c->dp[i] &= MP_MASK;
+  }
+
+  /* now copy higher words if any, e.g. if A has more digits than B  */
+  if (min != max) {
+    for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      c->dp[i] = a->dp[i] - u;
+
+      /* U = carry bit of T[i] */
+      u = (c->dp[i] >> DIGIT_BIT) & 1;
+
+      /* Clear carry from T[i] */
+      c->dp[i] &= MP_MASK;
+    }
+  }
+
+  /* clear digits above used (since we may not have grown result above) */
+  for (i = c->used; i < olduse; i++) {
+    c->dp[i] = 0;
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
diff --git a/bncore.c b/bncore.c
new file mode 100644
index 000000000..5c7d09896
--- /dev/null
+++ b/bncore.c
@@ -0,0 +1,19 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+int       KARATSUBA_MUL_CUTOFF = 80,	/* Min. number of digits before Karatsuba multiplication is used. */
+          KARATSUBA_SQR_CUTOFF = 80,	/* Min. number of digits before Karatsuba squaring is used. */
+          MONTGOMERY_EXPT_CUTOFF = 40;	/* max. number of digits that montgomery reductions will help for */
diff --git a/changes.txt b/changes.txt
index e78a2a3cd..e2c99034d 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,13 @@
+Jan 17th, 2003
+v0.12  -- re-wrote the majority of the makefile so its more portable and will
+          install via "make install" on most *nix platforms
+       -- Re-packaged all the source as seperate files.  Means the library a single
+          file packagage any more.  Instead of just adding "bn.c" you have to add
+          libtommath.a
+       -- Renamed "bn.h" to "tommath.h"
+       -- Changes to the manual to reflect all of this
+       -- Used GNU Indent to clean up the source
+
 Jan 15th, 2003
 v0.11  -- More subtle fixes
        -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
diff --git a/demo.c b/demo/demo.c
similarity index 94%
rename from demo.c
rename to demo/demo.c
index f48212025..3c3ef072f 100644
--- a/demo.c
+++ b/demo/demo.c
@@ -14,24 +14,14 @@
       typedef unsigned long long ulong64;
    #endif   
 #else   
-   #include "bn.h"
-#endif
-
-#ifdef TIMER_X86
-#define TIMER
-extern ulong64 _rdtsc(void);
-extern void _reset(void);
-ulong64 rdtsc(void) { return _rdtsc(); }
-void reset(void) { _reset(); }
+   #include "tommath.h"
 #endif
 
 #ifdef TIMER
-#ifndef TIMER_X86
 ulong64 _tt;
 void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
 #endif
-#endif
 
 #ifndef DEBUG
 int _ifuncs;
@@ -87,19 +77,19 @@ int main(void)
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
    int rr;
-   
+
 #ifdef TIMER
    int n;
    ulong64 tt;
-#endif   
-   
+#endif
+
    mp_init(&a);
    mp_init(&b);
    mp_init(&c);
    mp_init(&d);
    mp_init(&e);
    mp_init(&f);
-   
+
 #ifdef DEBUG
    mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
    mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
@@ -123,10 +113,10 @@ int main(void)
    mp_exptmod(&c, &b, &a, &d);
    dump_timings();
    return 0;
-#endif   
-      
-#ifdef TIMER      
-goto expt;
+#endif
+
+#ifdef TIMER
+      printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
       mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
@@ -135,7 +125,7 @@ goto expt;
              mp_add(&a, &b, &c);
          }
          tt = rdtsc();
-         printf("Adding %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)1000000));
+         printf("Adding %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
          mp_sqr(&a, &a);
          mp_sqr(&b, &b);
       }
@@ -148,7 +138,7 @@ goto expt;
              mp_sub(&a, &b, &c);
          }
          tt = rdtsc();
-         printf("Subtracting %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)1000000));
+         printf("Subtracting %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
          mp_sqr(&a, &a);
          mp_sqr(&b, &b);
       }
@@ -156,25 +146,25 @@ goto expt;
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 100000; rr++) {
+      for (rr = 0; rr < 1000000; rr++) {
           mp_sqr(&a, &b);
       }
       tt = rdtsc();
-      printf("Squaring %d-bit took %lu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
+      printf("Squaring %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
       mp_copy(&b, &a);
    }
    
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 100000; rr++) {
+      for (rr = 0; rr < 1000000; rr++) {
           mp_mul(&a, &a, &b);
       }
       tt = rdtsc();
-      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
+      printf("Multiplying %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
       mp_copy(&b, &a);
    }
-expt:
+   
    {
       char *primes[] = {
          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@@ -211,7 +201,7 @@ goto expt;
          draw(&d);
          exit(0);
       }
-      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
+      printf("Exponentiating %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
    }
    }   
 
@@ -229,14 +219,14 @@ goto expt;
          printf("Failed to invert\n");
          return 0;
       }
-      printf("Inverting mod %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
+      printf("Inverting mod %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
       mp_sqr(&a, &a);
       mp_sqr(&b, &b);
    }
    
    return 0;
   
-#endif   
+#endif
 
    inv_n = expt_n = lcm_n = gcd_n = add_n = sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = 0;   
    for (;;) {
diff --git a/etc/makefile b/etc/makefile
index ed8f91508..7a98e09b9 100644
--- a/etc/makefile
+++ b/etc/makefile
@@ -1 +1,15 @@
-CFLAGS += -I../ -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops ../bn.c 
\ No newline at end of file
+CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
+
+pprime: pprime.o
+	$(CC) pprime.o -ltommath -o pprime
+
+tune: tune.o
+	$(CC) tune.o -ltommath -o tune
+        
+mersenne: mersenne.o
+	$(CC) mersenne.o -ltommath -o mersenne
+        
+clean:
+	rm -f *.o pprime tune mersenne
+
+
diff --git a/etc/mersenne.c b/etc/mersenne.c
index b6bbd5139..04b2d988b 100644
--- a/etc/mersenne.c
+++ b/etc/mersenne.c
@@ -3,148 +3,152 @@
  * Tom St Denis, tomstdenis@iahu.ca
  */
 #include <time.h>
-#include <bn.h> 
+#include <bn.h>
 
-int is_mersenne(long s, int *pp)
+int
+is_mersenne (long s, int *pp)
 {
-  mp_int n, u, mu;
-  int res, k;
-  long ss;
-  
+  mp_int    n, u, mu;
+  int       res, k;
+  long      ss;
+
   *pp = 0;
-  
-  if ((res = mp_init(&n)) != MP_OKAY) {
-     return res;
+
+  if ((res = mp_init (&n)) != MP_OKAY) {
+    return res;
   }
-  
-  if ((res = mp_init(&u)) != MP_OKAY) {
-     goto __N;
+
+  if ((res = mp_init (&u)) != MP_OKAY) {
+    goto __N;
   }
-  
-  if ((res = mp_init(&mu)) != MP_OKAY) {
-     goto __U;
+
+  if ((res = mp_init (&mu)) != MP_OKAY) {
+    goto __U;
   }
-  
+
   /* n = 2^s - 1 */
-  mp_set(&n, 1);
+  mp_set (&n, 1);
   ss = s;
   while (ss--) {
-     if ((res = mp_mul_2(&n, &n)) != MP_OKAY) {
-        goto __MU;
-     }
+    if ((res = mp_mul_2 (&n, &n)) != MP_OKAY) {
+      goto __MU;
+    }
   }
-  if ((res = mp_sub_d(&n, 1, &n)) != MP_OKAY) {
-     goto __MU;
+  if ((res = mp_sub_d (&n, 1, &n)) != MP_OKAY) {
+    goto __MU;
   }
-  
+
   /* setup mu */
-  if ((res = mp_reduce_setup(&mu, &n)) != MP_OKAY) {
-     goto __MU;
+  if ((res = mp_reduce_setup (&mu, &n)) != MP_OKAY) {
+    goto __MU;
   }
-  
+
   /* set u=4 */
-  mp_set(&u, 4);
-  
+  mp_set (&u, 4);
+
   /* for k=1 to s-2 do */
   for (k = 1; k <= s - 2; k++) {
-      /* u = u^2 - 2 mod n */
-      if ((res = mp_sqr(&u, &u)) != MP_OKAY) {
-         goto __MU;
-      }
-      if ((res = mp_sub_d(&u, 2, &u)) != MP_OKAY) {
-         goto __MU;
-      }
-      
-      /* make sure u is positive */
-      if (u.sign == MP_NEG) {
-         if ((res = mp_add(&u, &n, &u)) != MP_OKAY) {
-            goto __MU;
-         }
-      }
-      
-      /* reduce */
-      if ((res = mp_reduce(&u, &n, &mu)) != MP_OKAY) {
-         goto __MU;
+    /* u = u^2 - 2 mod n */
+    if ((res = mp_sqr (&u, &u)) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((res = mp_sub_d (&u, 2, &u)) != MP_OKAY) {
+      goto __MU;
+    }
+
+    /* make sure u is positive */
+    if (u.sign == MP_NEG) {
+      if ((res = mp_add (&u, &n, &u)) != MP_OKAY) {
+	goto __MU;
       }
+    }
+
+    /* reduce */
+    if ((res = mp_reduce (&u, &n, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
   }
-  
+
   /* if u == 0 then its prime */
-  if (mp_iszero(&u) == 1) {
-     *pp = 1;
+  if (mp_iszero (&u) == 1) {
+    *pp = 1;
   }
-  
+
   res = MP_OKAY;
-__MU:  mp_clear(&mu);
-__U:   mp_clear(&u);
-__N:   mp_clear(&n);
+__MU:mp_clear (&mu);
+__U:mp_clear (&u);
+__N:mp_clear (&n);
   return res;
 }
 
 /* square root of a long < 65536 */
-long i_sqrt(long x)
+long
+i_sqrt (long x)
 {
-   long x1, x2;
-   
-   x2 = 16;
-   do {
-      x1 = x2;
-      x2 = x1 - ((x1 * x1) - x)/(2*x1);
-   } while (x1 != x2);
-   
-   if (x1*x1 > x) {
-      --x1;
-   }
-   
-   return x1;
+  long      x1, x2;
+
+  x2 = 16;
+  do {
+    x1 = x2;
+    x2 = x1 - ((x1 * x1) - x) / (2 * x1);
+  } while (x1 != x2);
+
+  if (x1 * x1 > x) {
+    --x1;
+  }
+
+  return x1;
 }
 
 /* is the long prime by brute force */
-int isprime(long k)
+int
+isprime (long k)
 {
-   long y, z;
-   
-   y = i_sqrt(k);
-   for (z = 2; z <= y; z++) {
-       if ((k % z) == 0) return 0;
-   }
-   return 1;
-}   
-   
-
-int main(void)
+  long      y, z;
+
+  y = i_sqrt (k);
+  for (z = 2; z <= y; z++) {
+    if ((k % z) == 0)
+      return 0;
+  }
+  return 1;
+}
+
+
+int
+main (void)
 {
-   int pp;
-   long k;
-   clock_t tt;
-  
-   k = 3;
-   
-   for (;;) {
-      /* start time */
-      tt = clock();
-      
-      /* test if 2^k - 1 is prime */
-      if (is_mersenne(k, &pp) != MP_OKAY) {
-         printf("Whoa error\n");
-         return -1;
-      }
-      
-      if (pp == 1) {
-         /* count time */
-         tt = clock() - tt;
-         
-         /* display if prime */
-         printf("2^%-5ld - 1 is prime, test took %ld ticks\n", k, tt);
-      }
-      
-      /* goto next odd exponent */
+  int       pp;
+  long      k;
+  clock_t   tt;
+
+  k = 3;
+
+  for (;;) {
+    /* start time */
+    tt = clock ();
+
+    /* test if 2^k - 1 is prime */
+    if (is_mersenne (k, &pp) != MP_OKAY) {
+      printf ("Whoa error\n");
+      return -1;
+    }
+
+    if (pp == 1) {
+      /* count time */
+      tt = clock () - tt;
+
+      /* display if prime */
+      printf ("2^%-5ld - 1 is prime, test took %ld ticks\n", k, tt);
+    }
+
+    /* goto next odd exponent */
+    k += 2;
+
+    /* but make sure its prime */
+    while (isprime (k) == 0) {
       k += 2;
-      
-      /* but make sure its prime */
-      while (isprime(k) == 0) {
-         k += 2;
-      }
-   }
-   return 0;
+    }
+  }
+  return 0;
 }
-
diff --git a/etc/pprime.c b/etc/pprime.c
index 4943088cd..c13690167 100644
--- a/etc/pprime.c
+++ b/etc/pprime.c
@@ -8,220 +8,286 @@
 #include "bn.h"
 
 /* fast square root */
-static mp_digit i_sqrt(mp_word x)
+static    mp_digit
+i_sqrt (mp_word x)
 {
-   mp_word x1, x2;
-   
-   x2 = x;
-   do {
-      x1 = x2;
-      x2 = x1 - ((x1 * x1) - x)/(2*x1);
-   } while (x1 != x2);
-   
-   if (x1*x1 > x) {
-      --x1;
-   }
-   
-   return x1;
-}   
-      
+  mp_word   x1, x2;
+
+  x2 = x;
+  do {
+    x1 = x2;
+    x2 = x1 - ((x1 * x1) - x) / (2 * x1);
+  } while (x1 != x2);
+
+  if (x1 * x1 > x) {
+    --x1;
+  }
+
+  return x1;
+}
+
 
 /* generates a prime digit */
-static mp_digit prime_digit()
+static    mp_digit
+prime_digit ()
 {
-   mp_digit r, x, y, next;
-   
-   /* make a DIGIT_BIT-bit random number */
-   for (r = x = 0; x < DIGIT_BIT; x++) {
-       r = (r << 1) | (rand() & 1);
-   }
-   
-   /* now force it odd */
-   r |= 1;
-   
-   /* force it to be >30 */
-   if (r < 30) {
-      r += 30;
-   }
-   
-   /* get square root, since if 'r' is composite its factors must be < than this */
-   y = i_sqrt(r);
-   next = (y+1)*(y+1);
-
-   do {
-      r += 2;   /* next candidate */
-        
-      /* update sqrt ? */
-      if (next <= r) {
-         ++y;
-         next = (y+1)*(y+1);
-      }
+  mp_digit  r, x, y, next;
 
-      /* loop if divisible by 3,5,7,11,13,17,19,23,29  */
-      if ((r % 3) == 0) { x = 0; continue; }
-      if ((r % 5) == 0) { x = 0; continue; }
-      if ((r % 7) == 0) { x = 0; continue; }
-      if ((r % 11) == 0) { x = 0; continue; }
-      if ((r % 13) == 0) { x = 0; continue; }
-      if ((r % 17) == 0) { x = 0; continue; }
-      if ((r % 19) == 0) { x = 0; continue; }
-      if ((r % 23) == 0) { x = 0; continue; }
-      if ((r % 29) == 0) { x = 0; continue; }
-      
-      /* now check if r is divisible by x + k={1,7,11,13,17,19,23,29} */
-      for (x = 30; x <= y; x += 30) {
-          if ((r % (x+1)) == 0) { x = 0; break; }
-          if ((r % (x+7)) == 0) { x = 0; break; }
-          if ((r % (x+11)) == 0) { x = 0; break; }
-          if ((r % (x+13)) == 0) { x = 0; break; }
-          if ((r % (x+17)) == 0) { x = 0; break; }
-          if ((r % (x+19)) == 0) { x = 0; break; }
-          if ((r % (x+23)) == 0) { x = 0; break; }
-          if ((r % (x+29)) == 0) { x = 0; break; }
+  /* make a DIGIT_BIT-bit random number */
+  for (r = x = 0; x < DIGIT_BIT; x++) {
+    r = (r << 1) | (rand () & 1);
+  }
+
+  /* now force it odd */
+  r |= 1;
+
+  /* force it to be >30 */
+  if (r < 30) {
+    r += 30;
+  }
+
+  /* get square root, since if 'r' is composite its factors must be < than this */
+  y = i_sqrt (r);
+  next = (y + 1) * (y + 1);
+
+  do {
+    r += 2;			/* next candidate */
+
+    /* update sqrt ? */
+    if (next <= r) {
+      ++y;
+      next = (y + 1) * (y + 1);
+    }
+
+    /* loop if divisible by 3,5,7,11,13,17,19,23,29  */
+    if ((r % 3) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 5) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 7) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 11) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 13) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 17) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 19) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 23) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 29) == 0) {
+      x = 0;
+      continue;
+    }
+
+    /* now check if r is divisible by x + k={1,7,11,13,17,19,23,29} */
+    for (x = 30; x <= y; x += 30) {
+      if ((r % (x + 1)) == 0) {
+	x = 0;
+	break;
       }
-   } while (x == 0);
-   
-   return r;
+      if ((r % (x + 7)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 11)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 13)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 17)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 19)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 23)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 29)) == 0) {
+	x = 0;
+	break;
+      }
+    }
+  } while (x == 0);
+
+  return r;
 }
 
 /* makes a prime of at least k bits */
-int pprime(int k, int li, mp_int *p, mp_int *q)
+int
+pprime (int k, int li, mp_int * p, mp_int * q)
 {
-   mp_int a, b, c, n, x, y, z, v;
-   int res, ii;
-   static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
-   
-   /* single digit ? */
-   if (k <= (int)DIGIT_BIT) {
-      mp_set(p, prime_digit());
-      return MP_OKAY;
-   }
-   
-   if ((res = mp_init(&c)) != MP_OKAY) {
-      return res;
-   }
-   
-   if ((res = mp_init(&v)) != MP_OKAY) {
-      goto __C;
-   }
-   
-   /* product of first 50 primes */
-   if ((res = mp_read_radix(&v, "19078266889580195013601891820992757757219839668357012055907516904309700014933909014729740190", 10)) != MP_OKAY) {
-      goto __V;
-   }   
-
-   if ((res = mp_init(&a)) != MP_OKAY) {
-      goto __V;
-   }
-   
-   /* set the prime */
-   mp_set(&a, prime_digit());
-   
-   if ((res = mp_init(&b)) != MP_OKAY) {
-      goto __A;
-   }
-   
-   if ((res = mp_init(&n)) != MP_OKAY) {
-      goto __B;
-   }
-   
-   if ((res = mp_init(&x)) != MP_OKAY) {
-      goto __N;
-   }
-
-   if ((res = mp_init(&y)) != MP_OKAY) {
-      goto __X;
-   }
-
-   if ((res = mp_init(&z)) != MP_OKAY) {
-      goto __Y;
-   }
-
-   /* now loop making the single digit */
-   while (mp_count_bits(&a) < k) {
-      printf("prime has %4d bits left\r", k - mp_count_bits(&a)); fflush(stdout);
-   top: 
-      mp_set(&b, prime_digit());
-      
-      /* now compute z = a * b * 2 */
-      if ((res = mp_mul(&a, &b, &z)) != MP_OKAY) {                     /* z = a * b */
-         goto __Z;
+  mp_int    a, b, c, n, x, y, z, v;
+  int       res, ii;
+  static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
+
+  /* single digit ? */
+  if (k <= (int) DIGIT_BIT) {
+    mp_set (p, prime_digit ());
+    return MP_OKAY;
+  }
+
+  if ((res = mp_init (&c)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&v)) != MP_OKAY) {
+    goto __C;
+  }
+
+  /* product of first 50 primes */
+  if ((res =
+       mp_read_radix (&v,
+		      "19078266889580195013601891820992757757219839668357012055907516904309700014933909014729740190",
+		      10)) != MP_OKAY) {
+    goto __V;
+  }
+
+  if ((res = mp_init (&a)) != MP_OKAY) {
+    goto __V;
+  }
+
+  /* set the prime */
+  mp_set (&a, prime_digit ());
+
+  if ((res = mp_init (&b)) != MP_OKAY) {
+    goto __A;
+  }
+
+  if ((res = mp_init (&n)) != MP_OKAY) {
+    goto __B;
+  }
+
+  if ((res = mp_init (&x)) != MP_OKAY) {
+    goto __N;
+  }
+
+  if ((res = mp_init (&y)) != MP_OKAY) {
+    goto __X;
+  }
+
+  if ((res = mp_init (&z)) != MP_OKAY) {
+    goto __Y;
+  }
+
+  /* now loop making the single digit */
+  while (mp_count_bits (&a) < k) {
+    printf ("prime has %4d bits left\r", k - mp_count_bits (&a));
+    fflush (stdout);
+  top:
+    mp_set (&b, prime_digit ());
+
+    /* now compute z = a * b * 2 */
+    if ((res = mp_mul (&a, &b, &z)) != MP_OKAY) {	/* z = a * b */
+      goto __Z;
+    }
+
+    if ((res = mp_copy (&z, &c)) != MP_OKAY) {	/* c = a * b */
+      goto __Z;
+    }
+
+    if ((res = mp_mul_2 (&z, &z)) != MP_OKAY) {	/* z = 2 * a * b */
+      goto __Z;
+    }
+
+    /* n = z + 1 */
+    if ((res = mp_add_d (&z, 1, &n)) != MP_OKAY) {	/* n = z + 1 */
+      goto __Z;
+    }
+
+    /* check (n, v) == 1 */
+    if ((res = mp_gcd (&n, &v, &y)) != MP_OKAY) {	/* y = (n, v) */
+      goto __Z;
+    }
+
+    if (mp_cmp_d (&y, 1) != MP_EQ)
+      goto top;
+
+    /* now try base x=bases[ii]  */
+    for (ii = 0; ii < li; ii++) {
+      mp_set (&x, bases[ii]);
+
+      /* compute x^a mod n */
+      if ((res = mp_exptmod (&x, &a, &n, &y)) != MP_OKAY) {	/* y = x^a mod n */
+	goto __Z;
       }
-   
-      if ((res = mp_copy(&z, &c)) != MP_OKAY) {                        /* c = a * b */
-         goto __Z;
+
+      /* if y == 1 loop */
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* now x^2a mod n */
+      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2a mod n */
+	goto __Z;
       }
-      
-      if ((res = mp_mul_2(&z, &z)) != MP_OKAY) {                       /* z = 2 * a * b */
-         goto __Z;
+
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* compute x^b mod n */
+      if ((res = mp_exptmod (&x, &b, &n, &y)) != MP_OKAY) {	/* y = x^b mod n */
+	goto __Z;
       }
-      
-      /* n = z + 1 */
-      if ((res = mp_add_d(&z, 1, &n)) != MP_OKAY) {                    /* n = z + 1 */
-         goto __Z;
+
+      /* if y == 1 loop */
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* now x^2b mod n */
+      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2b mod n */
+	goto __Z;
       }
-      
-      /* check (n, v) == 1 */
-      if ((res = mp_gcd(&n, &v, &y)) != MP_OKAY) {                     /* y = (n, v) */
-         goto __Z;
+
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* compute x^c mod n == x^ab mod n */
+      if ((res = mp_exptmod (&x, &c, &n, &y)) != MP_OKAY) {	/* y = x^ab mod n */
+	goto __Z;
       }
-      
-      if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
-      
-      /* now try base x=bases[ii]  */
-      for (ii = 0; ii < li; ii++) {
-         mp_set(&x, bases[ii]);
-       
-         /* compute x^a mod n */
-         if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) {             /* y = x^a mod n */
-            goto __Z;
-         }
-      
-         /* if y == 1 loop */
-         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
-   
-         /* now x^2a mod n */
-         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2a mod n */
-           goto __Z;
-         }
-   
-         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
-   
-         /* compute x^b mod n */
-         if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) {             /* y = x^b mod n */
-            goto __Z;
-         }
-      
-         /* if y == 1 loop */
-         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
-   
-         /* now x^2b mod n */
-         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2b mod n */
-            goto __Z;
-         }
-   
-         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
-
-         /* compute x^c mod n == x^ab mod n */
-         if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) {             /* y = x^ab mod n */
-             goto __Z;
-         }
-      
-         /* if y == 1 loop */
-         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
-   
-         /* now compute (x^c mod n)^2 */
-         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2ab mod n */
-            goto __Z;
-         }
-   
-         /* y should be 1 */
-         if (mp_cmp_d(&y, 1) != MP_EQ) continue;
-         break;
+
+      /* if y == 1 loop */
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* now compute (x^c mod n)^2 */
+      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2ab mod n */
+	goto __Z;
       }
-      
-      /* no bases worked? */
-      if (ii == li) goto top;
+
+      /* y should be 1 */
+      if (mp_cmp_d (&y, 1) != MP_EQ)
+	continue;
+      break;
+    }
+
+    /* no bases worked? */
+    if (ii == li)
+      goto top;
 
 /*
 {
@@ -236,62 +302,62 @@ int pprime(int k, int li, mp_int *p, mp_int *q)
    printf("----------------------------------------------------------------\n");
 }   
 */
-     /* a = n */
-     mp_copy(&n, &a);
+    /* a = n */
+    mp_copy (&n, &a);
   }
-  
+
   /* get q to be the order of the large prime subgroup */
-  mp_sub_d(&n, 1, q);
-  mp_div_2(q, q);
-  mp_div(q, &b, q, NULL);
+  mp_sub_d (&n, 1, q);
+  mp_div_2 (q, q);
+  mp_div (q, &b, q, NULL);
 
-  mp_exch(&n, p);
-   
-   res = MP_OKAY;
-__Z: mp_clear(&z); 
-__Y: mp_clear(&y); 
-__X: mp_clear(&x); 
-__N: mp_clear(&n); 
-__B: mp_clear(&b); 
-__A: mp_clear(&a); 
-__V: mp_clear(&v);
-__C: mp_clear(&c);
-   return res;
-}   
+  mp_exch (&n, p);
+
+  res = MP_OKAY;
+__Z:mp_clear (&z);
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__N:mp_clear (&n);
+__B:mp_clear (&b);
+__A:mp_clear (&a);
+__V:mp_clear (&v);
+__C:mp_clear (&c);
+  return res;
+}
 
 
-int main(void)
+int
+main (void)
 {
-   mp_int p, q;
-   char buf[4096];
-   int k, li;
-   clock_t t1;
-      
-   srand(time(NULL));
-   
-   printf("Enter # of bits: \n");
-   fgets(buf, sizeof(buf), stdin);
-   sscanf(buf, "%d", &k);
-   
-   printf("Enter number of bases to try (1 to 8):\n");
-   fgets(buf, sizeof(buf), stdin);
-   sscanf(buf, "%d", &li);
-   
-   
-   mp_init(&p);
-   mp_init(&q);
-   
-   t1 = clock();   
-   pprime(k, li, &p, &q);
-   t1 = clock() - t1;
-   
-   printf("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits(&p));
-   
-   mp_toradix(&p, buf, 10);
-   printf("P == %s\n", buf);
-   mp_toradix(&q, buf, 10);
-   printf("Q == %s\n", buf);
-   
-   return 0;
-}
+  mp_int    p, q;
+  char      buf[4096];
+  int       k, li;
+  clock_t   t1;
+
+  srand (time (NULL));
+
+  printf ("Enter # of bits: \n");
+  fgets (buf, sizeof (buf), stdin);
+  sscanf (buf, "%d", &k);
 
+  printf ("Enter number of bases to try (1 to 8):\n");
+  fgets (buf, sizeof (buf), stdin);
+  sscanf (buf, "%d", &li);
+
+
+  mp_init (&p);
+  mp_init (&q);
+
+  t1 = clock ();
+  pprime (k, li, &p, &q);
+  t1 = clock () - t1;
+
+  printf ("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits (&p));
+
+  mp_toradix (&p, buf, 10);
+  printf ("P == %s\n", buf);
+  mp_toradix (&q, buf, 10);
+  printf ("Q == %s\n", buf);
+
+  return 0;
+}
diff --git a/etc/tune.c b/etc/tune.c
new file mode 100644
index 000000000..0d03e7780
--- /dev/null
+++ b/etc/tune.c
@@ -0,0 +1,96 @@
+/* Tune the Karatsuba parameters
+ *
+ * Tom St Denis, tomstdenis@iahu.ca
+ */
+#include <tommath.h>
+#include <time.h>
+
+clock_t
+time_mult (void)
+{
+  clock_t   t1;
+  int       x, y;
+  mp_int    a, b, c;
+
+  mp_init (&a);
+  mp_init (&b);
+  mp_init (&c);
+
+  t1 = clock ();
+  for (x = 8; x <= 128; x += 8) {
+    mp_rand (&a, x);
+    mp_rand (&b, x);
+    for (y = 0; y < 10000; y++) {
+      mp_mul (&a, &b, &c);
+    }
+  }
+  mp_clear (&a);
+  mp_clear (&b);
+  mp_clear (&c);
+  return clock () - t1;
+}
+
+clock_t
+time_sqr (void)
+{
+  clock_t   t1;
+  int       x, y;
+  mp_int    a, b;
+
+  mp_init (&a);
+  mp_init (&b);
+
+  t1 = clock ();
+  for (x = 8; x <= 128; x += 8) {
+    mp_rand (&a, x);
+    for (y = 0; y < 10000; y++) {
+      mp_sqr (&a, &b);
+    }
+  }
+  mp_clear (&a);
+  mp_clear (&b);
+  return clock () - t1;
+}
+
+int
+main (void)
+{
+  int       best_mult, best_square;
+  clock_t   best, ti;
+
+  best_mult = best_square = 0;
+
+  /* tune multiplication first */
+  best = CLOCKS_PER_SEC * 1000;
+  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 128;
+       KARATSUBA_MUL_CUTOFF++) {
+    ti = time_mult ();
+    printf ("%4d : %9lu\r", KARATSUBA_MUL_CUTOFF, ti);
+    fflush (stdout);
+    if (ti < best) {
+      printf ("New best: %lu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
+      best = ti;
+      best_mult = KARATSUBA_MUL_CUTOFF;
+    }
+  }
+
+  /* tune squaring */
+  best = CLOCKS_PER_SEC * 1000;
+  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 128;
+       KARATSUBA_SQR_CUTOFF++) {
+    ti = time_sqr ();
+    printf ("%4d : %9lu\r", KARATSUBA_SQR_CUTOFF, ti);
+    fflush (stdout);
+    if (ti < best) {
+      printf ("New best: %lu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
+      best = ti;
+      best_square = KARATSUBA_SQR_CUTOFF;
+    }
+  }
+
+  printf
+    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\n",
+     best_mult, best_square);
+
+  return 0;
+}
diff --git a/poly.c b/etclib/poly.c
similarity index 100%
rename from poly.c
rename to etclib/poly.c
diff --git a/poly.h b/etclib/poly.h
similarity index 100%
rename from poly.h
rename to etclib/poly.h
diff --git a/ltmtest.exe b/ltmtest.exe
new file mode 100644
index 000000000..cc5c1f527
Binary files /dev/null and b/ltmtest.exe differ
diff --git a/makefile b/makefile
index 7567d22cc..5f6bcc6dc 100644
--- a/makefile
+++ b/makefile
@@ -1,18 +1,56 @@
-CC = gcc
-CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops
+CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.11
+VERSION=0.12
 
-default: test
+default: libtommath.a
 
-test: bn.o demo.o
-	$(CC) bn.o demo.o -o demo
-	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
+#default files to install
+LIBNAME=libtommath.a
+HEADERS=tommath.h
+
+#LIBPATH-The directory for libtomcrypt to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o
 
-# builds the x86 demo
-test86:
-	nasm -f coff timer.asm
-	$(CC) -DDEBUG -DTIMER_X86 $(CFLAGS) bn.c demo.c timer.o -o demo -s
+libtommath.a:  $(OBJECTS)
+	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
+	ranlib libtommath.a
+
+install: libtommath.a docs
+	install -d -g root -o root $(DESTDIR)$(LIBPATH)
+	install -d -g root -o root $(DESTDIR)$(INCPATH)
+	install -d -g root -o root $(DESTDIR)$(DATAPATH)
+	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -g root -o root bn.pdf $(DESTDIR)$(DATAPATH)
+
+test: libtommath.a demo/demo.o
+	$(CC) demo/demo.o libtommath.a -o test
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
+        
+timing: libtommath.a
+	$(CC) $(CFLAGS) -DTIMER demo/demo.c libtommath.a -o ltmtest -s
+	$(CC) $(CFLAGS) -DTIMER -DU_MPI -I./mtest/ demo/demo.c mtest/mpi.c -o mpitest -s
 
 docdvi: bn.tex
 	latex bn
@@ -22,7 +60,9 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 	
 clean:
-	rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne
+	rm -f *.pdf *.o *.a etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest \
+        bn.log bn.aux bn.dvi *.log *.s
+	cd etc ; make clean
 
 zipup: clean docs
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
diff --git a/mpitest.exe b/mpitest.exe
new file mode 100644
index 000000000..d32553a3f
Binary files /dev/null and b/mpitest.exe differ
diff --git a/mtest/mtest.c b/mtest/mtest.c
index df5422f47..17aef8d54 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -1,242 +1,242 @@
-/* makes a bignum test harness with NUM tests per operation
- *
- * the output is made in the following format [one parameter per line]
-
-operation
-operand1
-operand2
-[... operandN]
-result1
-result2
-[... resultN]
-
-So for example "a * b mod n" would be 
-
-mulmod
-a
-b
-n
-a*b mod n
-
-e.g. if a=3, b=4 n=11 then 
-
-mulmod
-3
-4
-11
-1
-
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include "mpi.c"
-
-FILE *rng;
-
-void rand_num(mp_int *a)
-{
-   int n, size;
-   unsigned char buf[512];
-
-top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
-   buf[0] = (fgetc(rng)&1)?1:0;
-   fread(buf+1, 1, size, rng);
-   for (n = 0; n < size; n++) {
-       if (buf[n+1]) break;
-   }
-   if (n == size) goto top;
-   mp_read_raw(a, buf, 1+size);
-}
-
-void rand_num2(mp_int *a)
-{
-   int n, size;
-   unsigned char buf[512];
-
-top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
-   buf[0] = (fgetc(rng)&1)?1:0;
-   fread(buf+1, 1, size, rng);
-   for (n = 0; n < size; n++) {
-       if (buf[n+1]) break;
-   }
-   if (n == size) goto top;
-   mp_read_raw(a, buf, 1+size);
-}
-
-int main(void)
-{
-   int n;
-   mp_int a, b, c, d, e;
-   char buf[4096];
-
-   mp_init(&a);
-   mp_init(&b);
-   mp_init(&c);
-   mp_init(&d);
-   mp_init(&e);
-
-   rng = fopen("/dev/urandom", "rb");
-   if (rng == NULL) {
-      rng = fopen("/dev/random", "rb");
-      if (rng == NULL) {
-         fprintf(stderr, "\nWarning:  stdin used as random source\n\n");
-         rng = stdin;
-      }
-   }
-
-   for (;;) {
-       n = fgetc(rng) % 11;
-
-   if (n == 0) {
-       /* add tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_add(&a, &b, &c);
-       printf("add\n");
-       mp_todecimal(&a, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
-       printf("%s\n", buf);
-   } else if (n == 1) {
-      /* sub tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_sub(&a, &b, &c);
-       printf("sub\n");
-       mp_todecimal(&a, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
-       printf("%s\n", buf);
-   } else if (n == 2) {
-       /* mul tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_mul(&a, &b, &c);
-       printf("mul\n");
-       mp_todecimal(&a, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
-       printf("%s\n", buf);
-   } else if (n == 3) {
-      /* div tests */
-       rand_num(&a);
-       rand_num(&b);
-       mp_div(&a, &b, &c, &d);
-       printf("div\n");
-       mp_todecimal(&a, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&d, buf);
-       printf("%s\n", buf);
-   } else if (n == 4) {
-      /* sqr tests */
-       rand_num(&a);
-       mp_sqr(&a, &b);
-       printf("sqr\n");
-       mp_todecimal(&a, buf);
-       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
-       printf("%s\n", buf);
-   } else if (n == 5) {
-      /* mul_2d test */
-      rand_num(&a);
-      mp_copy(&a, &b);
-      n = fgetc(rng) & 63;
-      mp_mul_2d(&b, n, &b);
-      mp_todecimal(&a, buf);
-      printf("mul2d\n");
-      printf("%s\n", buf);
-      printf("%d\n", n);
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 6) {
-      /* div_2d test */
-      rand_num(&a);
-      mp_copy(&a, &b);
-      n = fgetc(rng) & 63;
-      mp_div_2d(&b, n, &b, NULL);
-      mp_todecimal(&a, buf);
-      printf("div2d\n");
-      printf("%s\n", buf);
-      printf("%d\n", n);
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);
-   } else if (n == 7) {
-      /* gcd test */
-      rand_num(&a);
-      rand_num(&b);
-      a.sign = MP_ZPOS;
-      b.sign = MP_ZPOS;
-      mp_gcd(&a, &b, &c);
-      printf("gcd\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
-   } else if (n == 8) {
-      /* lcm test */
-      rand_num(&a);
-      rand_num(&b);
-      a.sign = MP_ZPOS;
-      b.sign = MP_ZPOS;
-      mp_lcm(&a, &b, &c);
-      printf("lcm\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
-   } else if (n == 9) {
-      /* exptmod test */
-      rand_num2(&a);
-      rand_num2(&b);
-      rand_num2(&c);
-      a.sign = b.sign = c.sign = 0;
-      mp_exptmod(&a, &b, &c, &d);
-      printf("expt\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&d, buf);
-      printf("%s\n", buf);      
-   } else if (n == 10) {
-      /* invmod test */
-      rand_num2(&a);
-      rand_num2(&b);
-      b.sign = MP_ZPOS;
-      a.sign = MP_ZPOS;
-      mp_gcd(&a, &b, &c);
-      if (mp_cmp_d(&c, 1) != 0) continue;
-      if (mp_cmp_d(&b, 1) == 0) continue;
-      mp_invmod(&a, &b, &c);
-      printf("invmod\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
-   } 
-   }
-   fclose(rng);
-   return 0;
-}
+/* makes a bignum test harness with NUM tests per operation
+ *
+ * the output is made in the following format [one parameter per line]
+
+operation
+operand1
+operand2
+[... operandN]
+result1
+result2
+[... resultN]
+
+So for example "a * b mod n" would be 
+
+mulmod
+a
+b
+n
+a*b mod n
+
+e.g. if a=3, b=4 n=11 then 
+
+mulmod
+3
+4
+11
+1
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "mpi.c"
+
+FILE *rng;
+
+void rand_num(mp_int *a)
+{
+   int n, size;
+   unsigned char buf[512];
+
+top:
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
+   buf[0] = (fgetc(rng)&1)?1:0;
+   fread(buf+1, 1, size, rng);
+   for (n = 0; n < size; n++) {
+       if (buf[n+1]) break;
+   }
+   if (n == size) goto top;
+   mp_read_raw(a, buf, 1+size);
+}
+
+void rand_num2(mp_int *a)
+{
+   int n, size;
+   unsigned char buf[512];
+
+top:
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
+   buf[0] = (fgetc(rng)&1)?1:0;
+   fread(buf+1, 1, size, rng);
+   for (n = 0; n < size; n++) {
+       if (buf[n+1]) break;
+   }
+   if (n == size) goto top;
+   mp_read_raw(a, buf, 1+size);
+}
+
+int main(void)
+{
+   int n;
+   mp_int a, b, c, d, e;
+   char buf[4096];
+
+   mp_init(&a);
+   mp_init(&b);
+   mp_init(&c);
+   mp_init(&d);
+   mp_init(&e);
+
+   rng = fopen("/dev/urandom", "rb");
+   if (rng == NULL) {
+      rng = fopen("/dev/random", "rb");
+      if (rng == NULL) {
+         fprintf(stderr, "\nWarning:  stdin used as random source\n\n");
+         rng = stdin;
+      }
+   }
+
+   for (;;) {
+       n = fgetc(rng) % 11;
+
+   if (n == 0) {
+       /* add tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_add(&a, &b, &c);
+       printf("add\n");
+       mp_todecimal(&a, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&b, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&c, buf);
+       printf("%s\n", buf);
+   } else if (n == 1) {
+      /* sub tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_sub(&a, &b, &c);
+       printf("sub\n");
+       mp_todecimal(&a, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&b, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&c, buf);
+       printf("%s\n", buf);
+   } else if (n == 2) {
+       /* mul tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_mul(&a, &b, &c);
+       printf("mul\n");
+       mp_todecimal(&a, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&b, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&c, buf);
+       printf("%s\n", buf);
+   } else if (n == 3) {
+      /* div tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_div(&a, &b, &c, &d);
+       printf("div\n");
+       mp_todecimal(&a, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&b, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&c, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&d, buf);
+       printf("%s\n", buf);
+   } else if (n == 4) {
+      /* sqr tests */
+       rand_num(&a);
+       mp_sqr(&a, &b);
+       printf("sqr\n");
+       mp_todecimal(&a, buf);
+       printf("%s\n", buf);
+       mp_todecimal(&b, buf);
+       printf("%s\n", buf);
+   } else if (n == 5) {
+      /* mul_2d test */
+      rand_num(&a);
+      mp_copy(&a, &b);
+      n = fgetc(rng) & 63;
+      mp_mul_2d(&b, n, &b);
+      mp_todecimal(&a, buf);
+      printf("mul2d\n");
+      printf("%s\n", buf);
+      printf("%d\n", n);
+      mp_todecimal(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 6) {
+      /* div_2d test */
+      rand_num(&a);
+      mp_copy(&a, &b);
+      n = fgetc(rng) & 63;
+      mp_div_2d(&b, n, &b, NULL);
+      mp_todecimal(&a, buf);
+      printf("div2d\n");
+      printf("%s\n", buf);
+      printf("%d\n", n);
+      mp_todecimal(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 7) {
+      /* gcd test */
+      rand_num(&a);
+      rand_num(&b);
+      a.sign = MP_ZPOS;
+      b.sign = MP_ZPOS;
+      mp_gcd(&a, &b, &c);
+      printf("gcd\n");
+      mp_todecimal(&a, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&b, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&c, buf);
+      printf("%s\n", buf);      
+   } else if (n == 8) {
+      /* lcm test */
+      rand_num(&a);
+      rand_num(&b);
+      a.sign = MP_ZPOS;
+      b.sign = MP_ZPOS;
+      mp_lcm(&a, &b, &c);
+      printf("lcm\n");
+      mp_todecimal(&a, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&b, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&c, buf);
+      printf("%s\n", buf);      
+   } else if (n == 9) {
+      /* exptmod test */
+      rand_num2(&a);
+      rand_num2(&b);
+      rand_num2(&c);
+      a.sign = b.sign = c.sign = 0;
+      mp_exptmod(&a, &b, &c, &d);
+      printf("expt\n");
+      mp_todecimal(&a, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&b, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&c, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&d, buf);
+      printf("%s\n", buf);      
+   } else if (n == 10) {
+      /* invmod test */
+      rand_num2(&a);
+      rand_num2(&b);
+      b.sign = MP_ZPOS;
+      a.sign = MP_ZPOS;
+      mp_gcd(&a, &b, &c);
+      if (mp_cmp_d(&c, 1) != 0) continue;
+      if (mp_cmp_d(&b, 1) == 0) continue;
+      mp_invmod(&a, &b, &c);
+      printf("invmod\n");
+      mp_todecimal(&a, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&b, buf);
+      printf("%s\n", buf);      
+      mp_todecimal(&c, buf);
+      printf("%s\n", buf);      
+   } 
+   }
+   fclose(rng);
+   return 0;
+}
diff --git a/timer.asm b/timer.asm
deleted file mode 100644
index b317e3e8b..000000000
--- a/timer.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-; Simple RDTSC reader for NASM
-;
-; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever]
-;
-; Most *nix installs use elf so it would be "nasm -f elf timer.asm"
-;
-; Tom St Denis
-[bits 32]
-[section .data]
-timer dd 0, 0
-[section .text]
-
-[global _gettsc]
-_gettsc:
-   rdtsc
-   ret
-
-[global _rdtsc]
-_rdtsc:
-   rdtsc
-   sub eax,[timer]
-   sbb edx,[timer+4]
-   ret
-
-[global _reset]
-_reset:
-   push eax
-   push edx
-   rdtsc
-   mov [timer],eax 
-   mov [timer+4],edx
-   pop edx
-   pop eax
-   ret
diff --git a/timings.txt b/timings.txt
new file mode 100644
index 000000000..117b9c5e0
--- /dev/null
+++ b/timings.txt
@@ -0,0 +1,39 @@
+CLOCKS_PER_SEC == 1000000
+Adding 128-bit took 0.070000 ticks
+Adding 256-bit took 0.100000 ticks
+Adding 512-bit took 0.140000 ticks
+Adding 1024-bit took 0.210000 ticks
+Adding 2048-bit took 0.360000 ticks
+Adding 4096-bit took 0.670000 ticks
+Subtracting 128-bit took 0.090000 ticks
+Subtracting 256-bit took 0.120000 ticks
+Subtracting 512-bit took 0.140000 ticks
+Subtracting 1024-bit took 0.210000 ticks
+Subtracting 2048-bit took 0.330000 ticks
+Subtracting 4096-bit took 0.580000 ticks
+Squaring 128-bit took 0.320000 ticks
+Squaring 256-bit took 0.620000 ticks
+Squaring 512-bit took 1.410000 ticks
+Squaring 1024-bit took 3.730000 ticks
+Squaring 2048-bit took 11.580000 ticks
+Squaring 4096-bit took 44.540000 ticks
+Multiplying 128-bit took 0.270000 ticks
+Multiplying 256-bit took 0.650000 ticks
+Multiplying 512-bit took 1.630000 ticks
+Multiplying 1024-bit took 5.180000 ticks
+Multiplying 2048-bit took 19.210000 ticks
+Multiplying 4096-bit took 67.500000 ticks
+Exponentiating 513-bit took 2000.000000 ticks
+Exponentiating 769-bit took 5200.000000 ticks
+Exponentiating 1025-bit took 11400.000000 ticks
+Exponentiating 2049-bit took 75100.000000 ticks
+Exponentiating 2561-bit took 150000.000000 ticks
+Exponentiating 3073-bit took 237800.000000 ticks
+Exponentiating 4097-bit took 510600.000000 ticks
+Inverting mod 128-bit took 0.000000 ticks
+Inverting mod 256-bit took 200.000000 ticks
+Inverting mod 512-bit took 300.000000 ticks
+Inverting mod 1024-bit took 800.000000 ticks
+Inverting mod 2048-bit took 2500.000000 ticks
+Inverting mod 4096-bit took 8400.000000 ticks
+
diff --git a/bn.h b/tommath.h
similarity index 83%
rename from bn.h
rename to tommath.h
index 5f39cbdaa..4ac61734b 100644
--- a/bn.h
+++ b/tommath.h
@@ -21,6 +21,11 @@
 #include <ctype.h>
 #include <limits.h>
 
+#undef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#undef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -82,9 +87,9 @@ extern "C" {
 typedef int           mp_err;
 
 /* you'll have to tune these... */
-#define KARATSUBA_MUL_CUTOFF    80      /* Min. number of digits before Karatsuba multiplication is used. */
-#define KARATSUBA_SQR_CUTOFF    80      /* Min. number of digits before Karatsuba squaring is used. */
-#define MONTGOMERY_EXPT_CUTOFF  40      /* max. number of digits that montgomery reductions will help for */
+extern int KARATSUBA_MUL_CUTOFF,
+           KARATSUBA_SQR_CUTOFF,
+           MONTGOMERY_EXPT_CUTOFF;
 
 #define MP_PREC                 64      /* default digits of precision */
 
@@ -126,6 +131,9 @@ void mp_set(mp_int *a, mp_digit b);
 /* set a 32-bit const */
 int mp_set_int(mp_int *a, unsigned long b);
 
+/* grow an int to a given size */
+int mp_grow(mp_int *a, int size);
+
 /* init to a given number of digits */
 int mp_init_size(mp_int *a, int size);
 
@@ -135,6 +143,9 @@ int mp_copy(mp_int *a, mp_int *b);
 /* inits and copies, a = b */
 int mp_init_copy(mp_int *a, mp_int *b);
 
+/* trim unused digits */
+void mp_clamp(mp_int *a);
+
 /* ---> digit manipulation <--- */
 
 /* right shift by "b" digits */
@@ -144,7 +155,7 @@ void mp_rshd(mp_int *a, int b);
 int mp_lshd(mp_int *a, int b);
 
 /* c = a / 2^b */
-int  mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
+int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
 
 /* b = a/2 */
 int mp_div_2(mp_int *a, mp_int *b);
@@ -161,6 +172,19 @@ int mp_mod_2d(mp_int *a, int b, mp_int *c);
 /* computes a = 2^b */
 int mp_2expt(mp_int *a, int b);
 
+/* makes a pseudo-random int of a given size */
+int mp_rand(mp_int *a, int digits);
+
+/* ---> binary operations <--- */
+/* c = a XOR b  */
+int mp_xor(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a OR b */
+int mp_or(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a AND b */
+int mp_and(mp_int *a, mp_int *b, mp_int *c);
+
 /* ---> Basic arithmetic <--- */
 
 /* b = -a */
@@ -178,6 +202,7 @@ int mp_cmp_mag(mp_int *a, mp_int *b);
 /* c = a + b */
 int mp_add(mp_int *a, mp_int *b, mp_int *c);
 
+
 /* c = a - b */
 int mp_sub(mp_int *a, mp_int *b, mp_int *c);
 
@@ -254,7 +279,7 @@ int mp_jacobi(mp_int *a, mp_int *n, int *c);
 /* used to setup the Barrett reduction for a given modulus b */
 int mp_reduce_setup(mp_int *a, mp_int *b);
 
-/* Barrett Reduction, computes a (mod b) with a precomputed value c  
+/* Barrett Reduction, computes a (mod b) with a precomputed value c
  *
  * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
  * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
@@ -266,12 +291,11 @@ int mp_montgomery_setup(mp_int *a, mp_digit *mp);
 
 /* computes xR^-1 == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
- 
+
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 
 /* ---> radix conversion <--- */
-
 int mp_count_bits(mp_int *a);
 
 int mp_unsigned_bin_size(mp_int *a);
@@ -298,6 +322,23 @@ int mp_radix_size(mp_int *a, int radix);
 #define mp_todecimal(M, S) mp_toradix((M), (S), 10)
 #define mp_tohex(M, S)     mp_toradix((M), (S), 16)
 
+/* lowlevel functions, do not call! */
+int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
+int s_mp_sub(mp_int *a, mp_int *b, mp_int *c);
+#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
+int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int fast_s_mp_sqr(mp_int *a, mp_int *b);
+int s_mp_sqr(mp_int *a, mp_int *b);
+int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c);
+int mp_karatsuba_sqr(mp_int *a, mp_int *b);
+int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y);
+void bn_reverse(unsigned char *s, int len);
+
 #ifdef __cplusplus
    }
 #endif