diff --git a/monocypher.opam b/monocypher.opam
index e69de29..f2c2edc 100644
--- a/monocypher.opam
+++ b/monocypher.opam
@@ -0,0 +1,20 @@
+opam-version: "2.0"
+authors: "Vincent Bernardoff <vb@luminar.eu.org>"
+maintainer: "Vincent Bernardoff <vb@luminar.eu.org>"
+homepage: "https://github.com/vbmithr/ocaml-monocypher"
+bug-reports: "https://github.com/vbmithr/ocaml-monocypher/issues"
+dev-repo: "git+https://github.com/vbmithr/ocaml-monocypher"
+doc: "https://vbmithr.github.io/ocaml-monocypher/doc"
+build: [ "dune" "build" "-j" jobs "-p" name ]
+run-test: [ "dune" "runtest" "-j" jobs "-p" name ]
+depends: [
+  "dune" {>= "1.11.4"}
+  "gen" {>= "0.5.2"}
+  "bigstring" {>= "0.2"}
+  "zarith" {>= "1.9.1"}
+  "hex" {with-test & >= "1.4.0"}
+  "alcotest" {with-test & >= "0.8.5"}
+]
+synopsis: "Boring crypto that simply works"
+description: """
+OCaml bindings to the Monocypher cryptographic library."""
diff --git a/src/dune b/src/dune
index d052bf4..8bd2fa5 100644
--- a/src/dune
+++ b/src/dune
@@ -2,5 +2,5 @@
  (name            monocypher)
  (public_name     monocypher)
  (libraries       gen bigstring zarith)
- (c_names         sha512 monocypher_stubs)
- (c_flags         -DED25519_SHA512 -O3))
+ (c_names         monocypher-ed25519 monocypher_stubs)
+ (c_flags         -O3))
diff --git a/src/monocypher-ed25519.c b/src/monocypher-ed25519.c
new file mode 100644
index 0000000..e9363b4
--- /dev/null
+++ b/src/monocypher-ed25519.c
@@ -0,0 +1,518 @@
+// Monocypher version __git__
+//
+// This file is dual-licensed.  Choose whichever licence you want from
+// the two licences listed below.
+//
+// The first licence is a regular 2-clause BSD licence.  The second licence
+// is the CC-0 from Creative Commons. It is intended to release Monocypher
+// to the public domain.  The BSD licence serves as a fallback option.
+//
+// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
+//
+// ------------------------------------------------------------------------
+//
+// Copyright (c) 2017-2019, Loup Vaillant
+// All rights reserved.
+//
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the
+//    distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ------------------------------------------------------------------------
+//
+// Written in 2017-2019 by Loup Vaillant
+//
+// To the extent possible under law, the author(s) have dedicated all copyright
+// and related neighboring rights to this software to the public domain
+// worldwide.  This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication along
+// with this software.  If not, see
+// <https://creativecommons.org/publicdomain/zero/1.0/>
+
+#include "monocypher-ed25519.h"
+
+/////////////////
+/// Utilities ///
+/////////////////
+#define FOR(i, min, max)     for (size_t i = min; i < max; i++) // i runs over [min, max)
+#define WIPE_CTX(ctx)        crypto_wipe(ctx   , sizeof(*(ctx))) // covers the whole pointed-to object
+#define MIN(a, b)            ((a) <= (b) ? (a) : (b))
+#define ALIGN(x, block_size) ((~(x) + 1) & ((block_size) - 1)) // (-x) mod block_size; block_size must be a power of 2
+typedef uint8_t u8;
+typedef uint64_t u64;
+
+static u64 load64_be(const u8 s[8])
+{
+    // Big endian: s[0] is the most significant byte of the result.
+    // Shift the accumulator left one byte at a time and append the
+    // next input byte at the bottom.
+    u64 word = 0;
+    FOR (k, 0, 8) {
+        word = (word << 8) | (u64)s[k];
+    }
+    return word;
+}
+
+static void store64_be(u8 out[8], u64 in)
+{
+    // Big endian: out[0] receives the most significant byte of `in`.
+    // Byte k sits 8 * (7 - k) bits above the bottom of the word; the
+    // mask keeps only that byte before it is narrowed to u8.
+    FOR (k, 0, 8) {
+        size_t shift = 8 * (7 - k);
+        u64    byte  = (in >> shift) & 0xff;
+        out[k] = (u8)byte;
+    }
+}
+
+///////////////
+/// SHA 512 ///
+///////////////
+static u64 rot(u64 x, int c       ) { return (x >> c) | (x << (64 - c));   } // rotate right; needs 0 < c < 64
+static u64 ch (u64 x, u64 y, u64 z) { return (x & y) ^ (~x & z);           } // per bit: x ? y : z
+static u64 maj(u64 x, u64 y, u64 z) { return (x & y) ^ ( x & z) ^ (y & z); } // per bit: majority of x, y, z
+static u64 big_sigma0(u64 x) { return rot(x, 28) ^ rot(x, 34) ^ rot(x, 39); }
+static u64 big_sigma1(u64 x) { return rot(x, 14) ^ rot(x, 18) ^ rot(x, 41); }
+static u64 lit_sigma0(u64 x) { return rot(x,  1) ^ rot(x,  8) ^ (x >> 7);   } // last term is a plain shift
+static u64 lit_sigma1(u64 x) { return rot(x, 19) ^ rot(x, 61) ^ (x >> 6);   } // last term is a plain shift
+
+static const u64 K[80] = { // one constant per round; sha512_compress reads K[0] .. K[79]
+    0x428a2f98d728ae22,0x7137449123ef65cd,0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc,
+    0x3956c25bf348b538,0x59f111f1b605d019,0x923f82a4af194f9b,0xab1c5ed5da6d8118,
+    0xd807aa98a3030242,0x12835b0145706fbe,0x243185be4ee4b28c,0x550c7dc3d5ffb4e2,
+    0x72be5d74f27b896f,0x80deb1fe3b1696b1,0x9bdc06a725c71235,0xc19bf174cf692694,
+    0xe49b69c19ef14ad2,0xefbe4786384f25e3,0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65,
+    0x2de92c6f592b0275,0x4a7484aa6ea6e483,0x5cb0a9dcbd41fbd4,0x76f988da831153b5,
+    0x983e5152ee66dfab,0xa831c66d2db43210,0xb00327c898fb213f,0xbf597fc7beef0ee4,
+    0xc6e00bf33da88fc2,0xd5a79147930aa725,0x06ca6351e003826f,0x142929670a0e6e70,
+    0x27b70a8546d22ffc,0x2e1b21385c26c926,0x4d2c6dfc5ac42aed,0x53380d139d95b3df,
+    0x650a73548baf63de,0x766a0abb3c77b2a8,0x81c2c92e47edaee6,0x92722c851482353b,
+    0xa2bfe8a14cf10364,0xa81a664bbc423001,0xc24b8b70d0f89791,0xc76c51a30654be30,
+    0xd192e819d6ef5218,0xd69906245565a910,0xf40e35855771202a,0x106aa07032bbd1b8,
+    0x19a4c116b8d2d0c8,0x1e376c085141ab53,0x2748774cdf8eeb99,0x34b0bcb5e19b48a8,
+    0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb,0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3,
+    0x748f82ee5defb2fc,0x78a5636f43172f60,0x84c87814a1f0ab72,0x8cc702081a6439ec,
+    0x90befffa23631e28,0xa4506cebde82bde9,0xbef9a3f7b2c67915,0xc67178f2e372532b,
+    0xca273eceea26619c,0xd186b8c721c0c207,0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178,
+    0x06f067aa72176fba,0x0a637dc5a2c898a6,0x113f9804bef90dae,0x1b710b35131c471b,
+    0x28db77f523047d84,0x32caab7b40c72493,0x3c9ebe0a15c9bebc,0x431d67c49c100d4c,
+    0x4cc5d4becb3e42b6,0x597f299cfc657e2a,0x5fcb6fab3ad6faec,0x6c44198c4a475817
+};
+
+static void sha512_compress(crypto_sha512_ctx *ctx)
+{   // Runs 80 rounds over ctx->input and adds the result into ctx->hash.
+    u64 a = ctx->hash[0];    u64 b = ctx->hash[1];
+    u64 c = ctx->hash[2];    u64 d = ctx->hash[3];
+    u64 e = ctx->hash[4];    u64 f = ctx->hash[5];
+    u64 g = ctx->hash[6];    u64 h = ctx->hash[7];
+    // Rounds 0-15 consume the 16 message words exactly as loaded.
+    FOR (j, 0, 16) {
+        u64 in = K[j] + ctx->input[j];
+        u64 t1 = big_sigma1(e) + ch (e, f, g) + h + in;
+        u64 t2 = big_sigma0(a) + maj(a, b, c);
+        h = g;  g = f;  f = e;  e = d  + t1;
+        d = c;  c = b;  b = a;  a = t1 + t2;
+    }
+    size_t i16 = 0; // offset into K of the current group of 16 rounds
+    FOR(i, 1, 5) { // rounds 16-79, as four groups of 16
+        i16 += 16;
+        FOR (j, 0, 16) {
+            // Extend the schedule in place: ctx->input is a 16-word ring.
+            // j is a size_t, so (j - k) & 15 wraps around to (j - k) mod 16.
+            ctx->input[j] += lit_sigma1(ctx->input[(j- 2) & 15]);
+            ctx->input[j] += lit_sigma0(ctx->input[(j-15) & 15]);
+            ctx->input[j] +=            ctx->input[(j- 7) & 15];
+            u64 in = K[i16 + j] + ctx->input[j];
+            u64 t1 = big_sigma1(e) + ch (e, f, g) + h + in;
+            u64 t2 = big_sigma0(a) + maj(a, b, c);
+            h = g;  g = f;  f = e;  e = d  + t1;
+            d = c;  c = b;  b = a;  a = t1 + t2;
+        }
+    }
+    // Fold the working variables back into the chaining value.
+    ctx->hash[0] += a;    ctx->hash[1] += b;
+    ctx->hash[2] += c;    ctx->hash[3] += d;
+    ctx->hash[4] += e;    ctx->hash[5] += f;
+    ctx->hash[6] += g;    ctx->hash[7] += h;
+}
+
+static void sha512_set_input(crypto_sha512_ctx *ctx, u8 input)
+{
+    // First byte of a block: clear the buffer so that every later
+    // byte can simply be OR-ed into place.
+    if (ctx->input_idx == 0) {
+        FOR (w, 0, 16) { ctx->input[w] = 0; }
+    }
+    // Big endian packing: byte 0 of a word goes to its top 8 bits.
+    size_t shift = 8 * (7 - ctx->input_idx % 8);
+    ctx->input[ctx->input_idx / 8] |= (u64)input << shift;
+}
+
+// Add y to the 128-bit counter x (x[0] = high word, x[1] = low word).
+static void sha512_incr(u64 x[2], u64 y)
+{
+    u64 low = x[1] + y;
+    // Unsigned wrap-around shows up as a sum smaller than the addend.
+    x[0] += (low < y);
+    x[1]  = low;
+}
+
+static void sha512_end_block(crypto_sha512_ctx *ctx)
+{
+    // Nothing to do until the 128-byte block buffer is full.
+    if (ctx->input_idx != 128) { return; }
+    sha512_incr(ctx->input_size, 1024); // 128 bytes = 1024 bits
+    sha512_compress(ctx);
+    ctx->input_idx = 0;
+}
+
+static void sha512_update(crypto_sha512_ctx *ctx,
+                          const u8 *message, size_t message_size)
+{
+    for (size_t n = 0; n < message_size; n++) { // byte-at-a-time path
+        sha512_set_input(ctx, message[n]);
+        ctx->input_idx++;
+        sha512_end_block(ctx); // compresses once 128 bytes are buffered
+    }
+}
+
+void crypto_sha512_init(crypto_sha512_ctx *ctx)
+{
+    // Initial chaining value of the hash state.
+    static const u64 iv[8] = {
+        0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+        0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+        0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+        0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+    };
+    FOR (i, 0, 8) { ctx->hash[i] = iv[i]; }
+    ctx->input_size[0] = 0;  // no input absorbed yet
+    ctx->input_size[1] = 0;
+    ctx->input_idx     = 0;  // block buffer is empty
+}
+
+void crypto_sha512_update(crypto_sha512_ctx *ctx,
+                          const u8 *message, size_t message_size)
+{
+    // Nothing to absorb.  Returning here also keeps the pointer
+    // arithmetic below away from a NULL `message` (undefined behaviour).
+    if (message_size == 0) { return; }
+
+    // Finish the partially filled block, if any, to reach a boundary
+    size_t align = MIN(ALIGN(ctx->input_idx, 128), message_size);
+    sha512_update(ctx, message, align);
+    message      += align;
+    message_size -= align;
+    // Process whole 128-byte blocks straight from the message
+    FOR (i, 0, message_size / 128) { // number of blocks
+        FOR (j, 0, 16) { ctx->input[j] = load64_be(message + j*8); }
+        message        += 128;
+        ctx->input_idx += 128;
+        sha512_end_block(ctx);
+    }
+    message_size &= 127;
+    // Buffer the remaining bytes (fewer than one block)
+    sha512_update(ctx, message, message_size);
+}
+
+void crypto_sha512_final(crypto_sha512_ctx *ctx, u8 hash[64])
+{   // Pads the buffered input, compresses it, writes the digest, wipes ctx.
+    sha512_incr(ctx->input_size, ctx->input_idx * 8); // size is in bits
+    sha512_set_input(ctx, 128);                       // padding (a single 0x80 byte)
+
+    // compress penultimate block (if any)
+    if (ctx->input_idx > 111) { // padding byte landed in words 14-15, which must hold the length
+        sha512_compress(ctx);
+        FOR(i, 0, 14) { // words 14 and 15 are overwritten just below
+            ctx->input[i] = 0;
+        }
+    }
+    // compress last block
+    ctx->input[14] = ctx->input_size[0]; // high word of the bit length
+    ctx->input[15] = ctx->input_size[1]; // low word of the bit length
+    sha512_compress(ctx);
+
+    // copy hash to output (big endian)
+    FOR (i, 0, 8) {
+        store64_be(hash + i*8, ctx->hash[i]);
+    }
+
+    WIPE_CTX(ctx); // ctx must be re-initialised before it is used again
+}
+
+void crypto_sha512(u8 hash[64], const u8 *message, size_t message_size)
+{
+    crypto_sha512_ctx state; // one-shot: init, absorb, finish
+    crypto_sha512_init(&state);
+    crypto_sha512_update(&state, message, message_size);
+    crypto_sha512_final(&state, hash);
+}
+
+static void sha512_vtable_init(void *ctx)
+{   // ctx is treated as a crypto_sign_ed25519_ctx; only its hash member is touched
+    crypto_sha512_init(&((crypto_sign_ed25519_ctx*)ctx)->hash);
+}
+
+static void sha512_vtable_update(void *ctx, const u8 *m, size_t s)
+{   // forwards s bytes at m to the SHA-512 state embedded in ctx
+    crypto_sha512_update(&((crypto_sign_ed25519_ctx*)ctx)->hash, m, s);
+}
+
+static void sha512_vtable_final(void *ctx, u8 *h)
+{   // writes the digest to h; crypto_sha512_final also wipes the embedded state
+    crypto_sha512_final(&((crypto_sign_ed25519_ctx*)ctx)->hash, h);
+}
+
+const crypto_sign_vtable crypto_sha512_vtable = { // positional: order follows crypto_sign_vtable
+    crypto_sha512,                   // hash: one-shot digest
+    sha512_vtable_init,              // init
+    sha512_vtable_update,            // update
+    sha512_vtable_final,             // final
+    sizeof(crypto_sign_ed25519_ctx), // ctx_size: abstract context plus SHA-512 state
+};
+
+////////////////////
+/// HMAC SHA 512 ///
+////////////////////
+void crypto_hmac_sha512_init(crypto_hmac_sha512_ctx *ctx,
+                             const u8 *key, size_t key_size)
+{
+    // hash key if it is too long
+    if (key_size > 128) {
+        crypto_sha512(ctx->key, key, key_size);
+        key      = ctx->key; // aliasing is safe: the loop below reads key[i] before writing ctx->key[i]
+        key_size = 64;
+    }
+    // Compute inner key: padded key XOR 0x36
+    FOR (i, 0, key_size)   { ctx->key[i] = key[i] ^ 0x36; }
+    FOR (i, key_size, 128) { ctx->key[i] =          0x36; } // zero padding XOR 0x36
+    // Start computing inner hash
+    crypto_sha512_init  (&ctx->ctx);
+    crypto_sha512_update(&ctx->ctx, ctx->key, 128);
+}
+
+void crypto_hmac_sha512_update(crypto_hmac_sha512_ctx *ctx,
+                               const u8 *message, size_t message_size)
+{   // message bytes go straight into the inner hash started by the init function
+    crypto_sha512_update(&ctx->ctx, message, message_size);
+}
+
+void crypto_hmac_sha512_final(crypto_hmac_sha512_ctx *ctx, u8 hmac[64])
+{
+    // Finish computing inner hash
+    crypto_sha512_final(&ctx->ctx, hmac); // hmac temporarily holds the inner digest
+    // Compute outer key: padded key XOR 0x5c
+    FOR (i, 0, 128) {
+        ctx->key[i] ^= 0x36 ^ 0x5c; // cancels the 0x36 applied at init, then applies 0x5c
+    }
+    // Compute outer hash
+    crypto_sha512_init  (&ctx->ctx);
+    crypto_sha512_update(&ctx->ctx, ctx->key , 128);
+    crypto_sha512_update(&ctx->ctx, hmac, 64);
+    crypto_sha512_final (&ctx->ctx, hmac); // outer hash
+    WIPE_CTX(ctx); // covers ctx->key as well as the hash state
+}
+
+void crypto_hmac_sha512(u8 hmac[64], const u8 *key, size_t key_size,
+                        const u8 *message, size_t message_size)
+{
+    crypto_hmac_sha512_ctx state; // one-shot: init, absorb, finish
+    crypto_hmac_sha512_init(&state, key, key_size);
+    crypto_hmac_sha512_update(&state, message, message_size);
+    crypto_hmac_sha512_final(&state, hmac);
+}
+
+
+///////////////
+/// Ed25519 ///
+///////////////
+
+void crypto_ed25519_public_key(u8       public_key[32],
+                               const u8 secret_key[32])
+{   // delegates to the custom-hash key derivation with the SHA-512 vtable
+    crypto_sign_public_key_custom_hash(public_key, secret_key,
+                                       &crypto_sha512_vtable);
+}
+
+void crypto_ed25519_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
+                                         const u8 secret_key[32],
+                                         const u8 public_key[32])
+{   // ctx must point into a crypto_sign_ed25519_ctx: the SHA-512 vtable casts it back
+    crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
+                                            &crypto_sha512_vtable);
+}
+
+void crypto_ed25519_check_init(crypto_check_ctx_abstract *ctx,
+                               const u8 signature[64],
+                               const u8 public_key[32])
+{   // ctx must point into a crypto_check_ed25519_ctx: the SHA-512 vtable casts it back
+    crypto_check_init_custom_hash(ctx, signature, public_key,
+                                  &crypto_sha512_vtable);
+}
+
+void crypto_ed25519_sign(u8        signature [64],
+                         const u8  secret_key[32],
+                         const u8  public_key[32],
+                         const u8 *message, size_t message_size)
+{
+    crypto_sign_ed25519_ctx ctx; // concrete context: abstract part plus SHA-512 state
+    crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
+    crypto_ed25519_sign_init_first_pass (actx, secret_key, public_key);
+    crypto_ed25519_sign_update          (actx, message, message_size); // first pass over the message
+    crypto_ed25519_sign_init_second_pass(actx);
+    crypto_ed25519_sign_update          (actx, message, message_size); // second pass over the same message
+    crypto_ed25519_sign_final           (actx, signature);
+}
+
+int crypto_ed25519_check(const u8  signature [64],
+                         const u8  public_key[32],
+                         const u8 *message, size_t message_size)
+{
+    crypto_check_ed25519_ctx ctx; // concrete context: abstract part plus SHA-512 state
+    crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
+    crypto_ed25519_check_init  (actx, signature, public_key);
+    crypto_ed25519_check_update(actx, message, message_size);
+    return crypto_ed25519_check_final(actx); // result of crypto_check_final, passed through unchanged
+}
+
+/////////////////////////////
+/// Custom and OCaml code ///
+/////////////////////////////
+
+#include <caml/mlvalues.h>
+#include <caml/memory.h>
+#include <caml/alloc.h>
+#include <caml/bigarray.h>
+#include "platform-specific.h"
+
+CAMLprim value caml_monocypher_sizeof_crypto_sha512_ctx(value unit) {
+    return Val_long(sizeof(crypto_sha512_ctx)); // size in bytes as an OCaml int; `unit` is ignored
+}
+
+CAMLprim value caml_monocypher_crypto_sha512_init(value ctx) {
+    crypto_sha512_init(Caml_ba_data_val(ctx)); // ctx: bigarray used as a crypto_sha512_ctx; its size is not checked here
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_sha512_update(value ctx, value msg) {
+    crypto_sha512_update(Caml_ba_data_val(ctx),
+                         Caml_ba_data_val(msg),
+                         Caml_ba_array_val(msg)->dim[0]); // length = first dimension; assumes a 1-D byte bigarray (TODO confirm)
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_sha512_final(value ctx, value hash) {
+    crypto_sha512_final(Caml_ba_data_val(ctx),
+                        Caml_ba_data_val(hash)); // hash must offer 64 writable bytes; not checked here
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_public_key(value pk, value sk) {
+    crypto_ed25519_public_key(Caml_ba_data_val(pk),  // out: 32 bytes; buffer sizes are not checked here
+                              Caml_ba_data_val(sk)); // in:  32 bytes
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_sizeof_crypto_sign_ctx(value unit) {
+    size_t ed25519_size = sizeof(crypto_sign_ed25519_ctx), blake2b_size = sizeof(crypto_sign_ctx);
+    return Val_long(ed25519_size > blake2b_size ? ed25519_size : blake2b_size); // the Ed25519 stubs need the SHA-512 flavour to fit
+}
+CAMLprim value caml_monocypher_crypto_ed25519_sign_init_first_pass(value ctx, value sk, value pk) {
+    crypto_ed25519_sign_init_first_pass(Caml_ba_data_val(ctx), // used as a crypto_sign_ed25519_ctx via the SHA-512 vtable
+                                        Caml_ba_data_val(sk),
+                                        Caml_ba_data_val(pk));
+    return Val_unit;
+}
+
+void crypto_ed25519_sign_init_first_pass_extended(crypto_sign_ctx_abstract *ctx,
+                                                  const u8 extended_key[64],
+                                                  const u8 public_key[32])
+{
+    ctx->hash  = &crypto_sha512_vtable; // set vtable
+    u8 *a      = ctx->buf;              // receives bytes 0-31 of the extended key
+    u8 *prefix = ctx->buf + 32;         // bytes 32-63: hashed first, before the message
+    FOR(i, 0, 64) { a[i] = extended_key[i]; } // copied verbatim: no hashing or clamping here
+
+    if (public_key == 0) { // no public key supplied: derive it from the extended key
+        crypto_sign_public_key_extended(ctx->pk, extended_key);
+    } else {
+        FOR (i, 0, 32) {
+            ctx->pk[i] = public_key[i];
+        }
+    }
+
+    // Deterministic part of EdDSA: Construct a nonce by hashing the message
+    // instead of generating a random number.
+    // An actual random number would work just fine, and would save us
+    // the trouble of hashing the message twice.  If we did that
+    // however, the user could get it wrong and reuse the nonce.
+    ctx->hash->init  (ctx); // ctx must really be a crypto_sign_ed25519_ctx (see sha512_vtable_init)
+    ctx->hash->update(ctx, prefix , 32);
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_sign_init_first_pass_extended(value ctx, value ek, value pk) {
+    crypto_ed25519_sign_init_first_pass_extended(Caml_ba_data_val(ctx),
+                                                 Caml_ba_data_val(ek),  // 64-byte extended key; size not checked here
+                                                 Caml_ba_data_val(pk)); // forwarded as-is; the callee derives the key when this is 0
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_sign_update(value ctx, value msg) {
+    crypto_ed25519_sign_update(Caml_ba_data_val(ctx),
+                               Caml_ba_data_val(msg),
+                               Caml_ba_array_val(msg)->dim[0]); // length = first dimension; assumes a 1-D byte bigarray (TODO confirm)
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_sign_init_second_pass(value ctx) {
+    crypto_ed25519_sign_init_second_pass(Caml_ba_data_val(ctx)); // macro alias of crypto_sign_init_second_pass
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_sign_final(value ctx, value signature) {
+    crypto_ed25519_sign_final(Caml_ba_data_val(ctx),
+                              Caml_ba_data_val(signature)); // signature buffer: 64 bytes expected (TODO confirm against crypto_sign_final)
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_sizeof_crypto_check_ctx(value unit) {
+    size_t ed25519_size = sizeof(crypto_check_ed25519_ctx), blake2b_size = sizeof(crypto_check_ctx);
+    return Val_long(ed25519_size > blake2b_size ? ed25519_size : blake2b_size); // the Ed25519 stubs need the SHA-512 flavour to fit
+}
+CAMLprim value caml_monocypher_crypto_ed25519_check_init(value ctx, value signature, value pk) {
+    crypto_ed25519_check_init(Caml_ba_data_val(ctx),       // used as a crypto_check_ed25519_ctx via the SHA-512 vtable
+                              Caml_ba_data_val(signature), // 64 bytes; buffer sizes are not checked here
+                              Caml_ba_data_val(pk));       // 32 bytes
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_check_update(value ctx, value msg) {
+    crypto_ed25519_check_update(Caml_ba_data_val(ctx),
+                                Caml_ba_data_val(msg),
+                                Caml_ba_array_val(msg)->dim[0]); // length = first dimension; assumes a 1-D byte bigarray (TODO confirm)
+    return Val_unit;
+}
+
+CAMLprim value caml_monocypher_crypto_ed25519_check_final(value ctx) {
+    return Val_int(crypto_ed25519_check_final(Caml_ba_data_val(ctx))); // C int result handed to OCaml unchanged
+}
diff --git a/src/monocypher-ed25519.h b/src/monocypher-ed25519.h
new file mode 100644
index 0000000..110d4d0
--- /dev/null
+++ b/src/monocypher-ed25519.h
@@ -0,0 +1,139 @@
+// Monocypher version __git__
+//
+// This file is dual-licensed.  Choose whichever licence you want from
+// the two licences listed below.
+//
+// The first licence is a regular 2-clause BSD licence.  The second licence
+// is the CC-0 from Creative Commons. It is intended to release Monocypher
+// to the public domain.  The BSD licence serves as a fallback option.
+//
+// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
+//
+// ------------------------------------------------------------------------
+//
+// Copyright (c) 2017-2019, Loup Vaillant
+// All rights reserved.
+//
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the
+//    distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ------------------------------------------------------------------------
+//
+// Written in 2017-2019 by Loup Vaillant
+//
+// To the extent possible under law, the author(s) have dedicated all copyright
+// and related neighboring rights to this software to the public domain
+// worldwide.  This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication along
+// with this software.  If not, see
+// <https://creativecommons.org/publicdomain/zero/1.0/>
+
+#ifndef ED25519_H
+#define ED25519_H
+
+#include "monocypher.h"
+
+////////////////////////
+/// Type definitions ///
+////////////////////////
+
+// Do not rely on the size or contents of any of these types,
+// they may change without notice.
+typedef struct {
+    uint64_t hash[8];       // chaining value (8 x 64-bit words)
+    uint64_t input[16];     // current 128-byte block, held as big-endian words
+    uint64_t input_size[2]; // input length in bits so far; [0] is the high word
+    size_t   input_idx;     // number of bytes currently buffered in input
+} crypto_sha512_ctx;
+
+typedef struct {
+    uint8_t key[128];      // key padded to one block, XOR-ed with the inner (then outer) pad
+    crypto_sha512_ctx ctx; // SHA-512 state: inner hash first, reused for the outer hash
+} crypto_hmac_sha512_ctx;
+
+typedef struct {
+    crypto_sign_ctx_abstract ctx;  // keep first: pointers to this struct are cast to and from the abstract type
+    crypto_sha512_ctx        hash; // SHA-512 state reached through crypto_sha512_vtable
+} crypto_sign_ed25519_ctx;
+typedef crypto_sign_ed25519_ctx crypto_check_ed25519_ctx; // checking shares the signing layout
+
+// SHA 512
+// -------
+void crypto_sha512_init  (crypto_sha512_ctx *ctx);
+void crypto_sha512_update(crypto_sha512_ctx *ctx,
+                          const uint8_t *message, size_t  message_size);
+void crypto_sha512_final (crypto_sha512_ctx *ctx, uint8_t hash[64]);
+void crypto_sha512(uint8_t hash[64], const uint8_t *message, size_t message_size);
+
+// vtable for signatures
+extern const crypto_sign_vtable crypto_sha512_vtable;
+
+
+// HMAC SHA 512
+// ------------
+void crypto_hmac_sha512_init(crypto_hmac_sha512_ctx *ctx,
+                             const uint8_t *key, size_t key_size);
+void crypto_hmac_sha512_update(crypto_hmac_sha512_ctx *ctx,
+                               const uint8_t *message, size_t  message_size);
+void crypto_hmac_sha512_final(crypto_hmac_sha512_ctx *ctx, uint8_t hmac[64]);
+void crypto_hmac_sha512(uint8_t hmac[64],
+                        const uint8_t *key    , size_t key_size,
+                        const uint8_t *message, size_t message_size);
+
+
+// Ed25519
+// -------
+
+// Generate public key
+void crypto_ed25519_public_key(uint8_t       public_key[32],
+                               const uint8_t secret_key[32]);
+
+// Direct interface
+void crypto_ed25519_sign(uint8_t        signature [64],
+                         const uint8_t  secret_key[32],
+                         const uint8_t  public_key[32], // optional, may be 0
+                         const uint8_t *message, size_t message_size);
+int crypto_ed25519_check(const uint8_t  signature [64],
+                         const uint8_t  public_key[32],
+                         const uint8_t *message, size_t message_size);
+
+// Incremental interface
+void crypto_ed25519_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
+                                         const uint8_t secret_key[32],
+                                         const uint8_t public_key[32]);
+#define crypto_ed25519_sign_update crypto_sign_update
+#define crypto_ed25519_sign_init_second_pass crypto_sign_init_second_pass
+// use crypto_ed25519_sign_update() again.
+#define crypto_ed25519_sign_final crypto_sign_final
+
+void crypto_ed25519_check_init(crypto_check_ctx_abstract *ctx,
+                               const uint8_t signature[64],
+                               const uint8_t public_key[32]);
+#define crypto_ed25519_check_update crypto_check_update
+#define crypto_ed25519_check_final crypto_check_final
+
+
+#endif // ED25519_H
diff --git a/src/monocypher.h b/src/monocypher.h
index 019f70f..b92e044 100644
--- a/src/monocypher.h
+++ b/src/monocypher.h
@@ -1,22 +1,79 @@
+// Monocypher version __git__
+//
+// This file is dual-licensed.  Choose whichever licence you want from
+// the two licences listed below.
+//
+// The first licence is a regular 2-clause BSD licence.  The second licence
+// is the CC-0 from Creative Commons. It is intended to release Monocypher
+// to the public domain.  The BSD licence serves as a fallback option.
+//
+// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
+//
+// ------------------------------------------------------------------------
+//
+// Copyright (c) 2017-2019, Loup Vaillant
+// All rights reserved.
+//
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the
+//    distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ------------------------------------------------------------------------
+//
+// Written in 2017-2019 by Loup Vaillant
+//
+// To the extent possible under law, the author(s) have dedicated all copyright
+// and related neighboring rights to this software to the public domain
+// worldwide.  This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication along
+// with this software.  If not, see
+// <https://creativecommons.org/publicdomain/zero/1.0/>
+
 #ifndef MONOCYPHER_H
 #define MONOCYPHER_H
 
-#include <inttypes.h>
 #include <stddef.h>
+#include <stdint.h>
 
 ////////////////////////
 /// Type definitions ///
 ////////////////////////
 
-// Do not rely on the size or content on any of those types,
-// they may change without notice.
-
-// Chacha20
+// Vtable for EdDSA with a custom hash.
+// Instantiate it to define a custom hash.
+// Its size, contents, and layout, are part of the public API.
 typedef struct {
-    uint32_t input[16]; // current input, unencrypted
-    uint32_t pool [16]; // last input, encrypted
-    size_t   pool_idx;  // pointer to random_pool
-} crypto_chacha_ctx;
+    void (*hash)(uint8_t hash[64], const uint8_t *message, size_t message_size);
+    void (*init  )(void *ctx);
+    void (*update)(void *ctx, const uint8_t *message, size_t message_size);
+    void (*final )(void *ctx, uint8_t hash[64]);
+    size_t ctx_size;
+} crypto_sign_vtable;
+
+// Do not rely on the size or contents of any of the types below,
+// they may change without notice.
 
 // Poly1305
 typedef struct {
@@ -27,16 +84,6 @@ typedef struct {
     size_t   c_idx;  // How many bytes are there in the chunk.
 } crypto_poly1305_ctx;
 
-// Authenticated encryption
-typedef struct {
-    crypto_chacha_ctx   chacha;
-    crypto_poly1305_ctx poly;
-    uint64_t            ad_size;
-    uint64_t            message_size;
-    int                 ad_phase;
-} crypto_lock_ctx;
-#define crypto_unlock_ctx crypto_lock_ctx
-
 // Hash (Blake2b)
 typedef struct {
     uint64_t hash[8];
@@ -47,23 +94,18 @@ typedef struct {
 } crypto_blake2b_ctx;
 
 // Signatures (EdDSA)
-#ifdef ED25519_SHA512
-    #include "sha512.h"
-    typedef crypto_sha512_ctx crypto_hash_ctx;
-#else
-    typedef crypto_blake2b_ctx crypto_hash_ctx;
-#endif
 typedef struct {
-    crypto_hash_ctx hash;
+    const crypto_sign_vtable *hash;
     uint8_t buf[96];
     uint8_t pk [32];
-} crypto_sign_ctx;
-typedef struct {
-    crypto_hash_ctx hash;
-    uint8_t sig[64];
-    uint8_t pk [32];
-} crypto_check_ctx;
+} crypto_sign_ctx_abstract;
+typedef crypto_sign_ctx_abstract crypto_check_ctx_abstract;
 
+typedef struct {
+    crypto_sign_ctx_abstract ctx;
+    crypto_blake2b_ctx       hash;
+} crypto_sign_ctx;
+typedef crypto_sign_ctx crypto_check_ctx;
 
 ////////////////////////////
 /// High level interface ///
@@ -113,31 +155,6 @@ int crypto_unlock_aead(uint8_t       *plain_text,
                        const uint8_t *ad         , size_t ad_size,
                        const uint8_t *cipher_text, size_t text_size);
 
-// Incremental interface (encryption)
-void crypto_lock_init(crypto_lock_ctx *ctx,
-                      const uint8_t    key[32],
-                      const uint8_t    nonce[24]);
-void crypto_lock_auth_ad(crypto_lock_ctx *ctx,
-                         const uint8_t   *message,
-                         size_t           message_size);
-void crypto_lock_auth_message(crypto_lock_ctx *ctx,
-                              const uint8_t *cipher_text, size_t text_size);
-void crypto_lock_update(crypto_lock_ctx *ctx,
-                        uint8_t         *cipher_text,
-                        const uint8_t   *plain_text,
-                        size_t           text_size);
-void crypto_lock_final(crypto_lock_ctx *ctx, uint8_t mac[16]);
-
-// Incremental interface (decryption)
-#define crypto_unlock_init         crypto_lock_init
-#define crypto_unlock_auth_ad      crypto_lock_auth_ad
-#define crypto_unlock_auth_message crypto_lock_auth_message
-void crypto_unlock_update(crypto_unlock_ctx *ctx,
-                          uint8_t           *plain_text,
-                          const uint8_t     *cipher_text,
-                          size_t             text_size);
-int crypto_unlock_final(crypto_unlock_ctx *ctx, const uint8_t mac[16]);
-
 
 // General purpose hash (Blake2b)
 // ------------------------------
@@ -159,6 +176,9 @@ void crypto_blake2b_final (crypto_blake2b_ctx *ctx, uint8_t *hash);
 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
                                  const uint8_t      *key, size_t key_size);
 
+// Blake2b vtable, for use with the *_custom_hash signature functions
+extern const crypto_sign_vtable crypto_blake2b_vtable;
+
 
 // Password key derivation (Argon2 i)
 // ----------------------------------
@@ -166,7 +186,7 @@ void crypto_argon2i(uint8_t       *hash,      uint32_t hash_size,     // >= 4
                     void          *work_area, uint32_t nb_blocks,     // >= 8
                     uint32_t       nb_iterations,                     // >= 1
                     const uint8_t *password,  uint32_t password_size,
-                    const uint8_t *salt,      uint32_t salt_size);
+                    const uint8_t *salt,      uint32_t salt_size);    // >= 8
 
 void crypto_argon2i_general(uint8_t       *hash,      uint32_t hash_size,// >= 4
                             void          *work_area, uint32_t nb_blocks,// >= 8
@@ -180,9 +200,9 @@ void crypto_argon2i_general(uint8_t       *hash,      uint32_t hash_size,// >= 4
 // Key exchange (x25519 + HChacha20)
 // ---------------------------------
 #define crypto_key_exchange_public_key crypto_x25519_public_key
-int crypto_key_exchange(uint8_t       shared_key      [32],
-                        const uint8_t your_secret_key [32],
-                        const uint8_t their_public_key[32]);
+void crypto_key_exchange(uint8_t       shared_key      [32],
+                         const uint8_t your_secret_key [32],
+                         const uint8_t their_public_key[32]);
 
 
 // Signatures (EdDSA with curve25519 + Blake2b)
@@ -192,6 +212,10 @@ int crypto_key_exchange(uint8_t       shared_key      [32],
 void crypto_sign_public_key(uint8_t        public_key[32],
                             const uint8_t  secret_key[32]);
 
+// Generate public key from an extended (already hashed) key
+void crypto_sign_public_key_extended(uint8_t        public_key[32],
+                                     const uint8_t  extended_key[64]);
+
 // Direct interface
 void crypto_sign(uint8_t        signature [64],
                  const uint8_t  secret_key[32],
@@ -202,23 +226,35 @@ int crypto_check(const uint8_t  signature [64],
                  const uint8_t *message, size_t message_size);
 
 // Incremental interface for signatures (2 passes)
-void crypto_sign_init_first_pass(crypto_sign_ctx *ctx,
+void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
                                  const uint8_t  secret_key[32],
                                  const uint8_t  public_key[32]);
-void crypto_sign_update(crypto_sign_ctx *ctx,
+void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
                         const uint8_t *message, size_t message_size);
-void crypto_sign_init_second_pass(crypto_sign_ctx *ctx);
+void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx);
 // use crypto_sign_update() again.
-void crypto_sign_final(crypto_sign_ctx *ctx, uint8_t signature[64]);
+void crypto_sign_final(crypto_sign_ctx_abstract *ctx, uint8_t signature[64]);
 
 // Incremental interface for verification (1 pass)
-void crypto_check_init  (crypto_check_ctx *ctx,
+void crypto_check_init  (crypto_check_ctx_abstract *ctx,
                          const uint8_t signature[64],
                          const uint8_t public_key[32]);
-void crypto_check_update(crypto_check_ctx *ctx,
+void crypto_check_update(crypto_check_ctx_abstract *ctx,
                          const uint8_t *message, size_t message_size);
-int crypto_check_final  (crypto_check_ctx *ctx);
-
+int crypto_check_final  (crypto_check_ctx_abstract *ctx);
+
+// Custom hash interface
+void crypto_sign_public_key_custom_hash(uint8_t       public_key[32],
+                                        const uint8_t secret_key[32],
+                                        const crypto_sign_vtable *hash);
+void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
+                                             const uint8_t secret_key[32],
+                                             const uint8_t public_key[32],
+                                             const crypto_sign_vtable *hash);
+void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
+                                   const uint8_t signature[64],
+                                   const uint8_t public_key[32],
+                                   const crypto_sign_vtable *hash);
 
 ////////////////////////////
 /// Low level primitives ///
@@ -226,32 +262,47 @@ int crypto_check_final  (crypto_check_ctx *ctx);
 
 // For experts only.  You have been warned.
 
-
 // Chacha20
 // --------
 
 // Specialised hash.
-void crypto_chacha20_H(uint8_t       out[32],
-                       const uint8_t key[32],
-                       const uint8_t in [16]);
-
-void crypto_chacha20_init(crypto_chacha_ctx *ctx,
-                          const uint8_t      key[32],
-                          const uint8_t      nonce[8]);
-
-void crypto_chacha20_x_init(crypto_chacha_ctx *ctx,
-                            const uint8_t      key[32],
-                            const uint8_t      nonce[24]);
-
-void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, uint64_t ctr);
-
-void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
-                             uint8_t           *cipher_text,
-                             const uint8_t     *plain_text,
-                             size_t             text_size);
-
-void crypto_chacha20_stream(crypto_chacha_ctx *ctx,
-                            uint8_t *stream, size_t size);
+void crypto_hchacha20(uint8_t       out[32],
+                      const uint8_t key[32],
+                      const uint8_t in [16]);
+
+void crypto_chacha20(uint8_t       *cipher_text,
+                     const uint8_t *plain_text,
+                     size_t         text_size,
+                     const uint8_t  key[32],
+                     const uint8_t  nonce[8]);
+void crypto_xchacha20(uint8_t       *cipher_text,
+                      const uint8_t *plain_text,
+                      size_t         text_size,
+                      const uint8_t  key[32],
+                      const uint8_t  nonce[24]);
+void crypto_ietf_chacha20(uint8_t       *cipher_text,
+                          const uint8_t *plain_text,
+                          size_t         text_size,
+                          const uint8_t  key[32],
+                          const uint8_t  nonce[12]);
+uint64_t crypto_chacha20_ctr(uint8_t       *cipher_text,
+                             const uint8_t *plain_text,
+                             size_t         text_size,
+                             const uint8_t  key[32],
+                             const uint8_t  nonce[8],
+                             uint64_t       ctr);
+uint64_t crypto_xchacha20_ctr(uint8_t       *cipher_text,
+                              const uint8_t *plain_text,
+                              size_t         text_size,
+                              const uint8_t  key[32],
+                              const uint8_t  nonce[24],
+                              uint64_t       ctr);
+uint32_t crypto_ietf_chacha20_ctr(uint8_t       *cipher_text,
+                                  const uint8_t *plain_text,
+                                  size_t         text_size,
+                                  const uint8_t  key[32],
+                                  const uint8_t  nonce[12],
+                                  uint32_t       ctr);
 
 
 // Poly 1305
@@ -273,8 +324,8 @@ void crypto_poly1305_final (crypto_poly1305_ctx *ctx, uint8_t mac[16]);
 // -------
 void crypto_x25519_public_key(uint8_t       public_key[32],
                               const uint8_t secret_key[32]);
-int crypto_x25519(uint8_t       raw_shared_secret[32],
-                  const uint8_t your_secret_key  [32],
-                  const uint8_t their_public_key [32]);
+void crypto_x25519(uint8_t       raw_shared_secret[32],
+                   const uint8_t your_secret_key  [32],
+                   const uint8_t their_public_key [32]);
 
 #endif // MONOCYPHER_H
diff --git a/src/monocypher.ml b/src/monocypher.ml
index 4870520..4f99952 100644
--- a/src/monocypher.ml
+++ b/src/monocypher.ml
@@ -197,7 +197,7 @@ module DH = struct
   external neuterize : Bigstring.t -> Bigstring.t -> unit =
     "caml_monocypher_crypto_key_exchange_public_key" [@@noalloc]
 
-  external exchange : Bigstring.t -> Bigstring.t -> Bigstring.t -> int =
+  external exchange : Bigstring.t -> Bigstring.t -> Bigstring.t -> unit =
     "caml_monocypher_crypto_key_exchange" [@@noalloc]
 
   type shared
@@ -248,14 +248,8 @@ module DH = struct
 
   let shared (Sk sk) (Pk pk) =
     let k = Bigstring.create bytes in
-    match exchange k sk pk with
-    | 0 -> Some (K k)
-    | _ -> None
-
-  let shared_exn sk pk =
-    match shared sk pk with
-    | None -> invalid_arg "DH.shared_exn"
-    | Some k -> k
+    exchange k sk pk ;
+    K k
 
   let blit : type a. a key -> Bigstring.t -> int -> int = fun k buf pos ->
     begin match k with
@@ -306,7 +300,7 @@ end
 
 module Sign = struct
   external neuterize : Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_sign_public_key" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_public_key" [@@noalloc]
 
   external neuterize_extended : Bigstring.t -> Bigstring.t -> unit =
     "caml_monocypher_crypto_sign_public_key_extended" [@@noalloc]
@@ -322,32 +316,32 @@ module Sign = struct
 
   external sign_init_first_pass :
     Bigstring.t -> Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_sign_init_first_pass" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_sign_init_first_pass" [@@noalloc]
 
   external sign_init_first_pass_extended :
     Bigstring.t -> Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_sign_init_first_pass_extended" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_sign_init_first_pass_extended" [@@noalloc]
 
   external sign_init_second_pass :
     Bigstring.t -> unit =
-    "caml_monocypher_crypto_sign_init_second_pass" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_sign_init_second_pass" [@@noalloc]
 
   external sign_update :
     Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_sign_update" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_sign_update" [@@noalloc]
 
   external sign_final :
     Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_sign_final" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_sign_final" [@@noalloc]
 
   external check_init : Bigstring.t -> Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_check_init" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_check_init" [@@noalloc]
 
   external check_update : Bigstring.t -> Bigstring.t -> unit =
-    "caml_monocypher_crypto_check_update" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_check_update" [@@noalloc]
 
   external check_final : Bigstring.t -> int =
-    "caml_monocypher_crypto_check_final" [@@noalloc]
+    "caml_monocypher_crypto_ed25519_check_final" [@@noalloc]
 
   type _ key =
     | Sk : Bigstring.t -> secret key
@@ -574,7 +568,7 @@ module Ed25519 = struct
     "caml_monocypher_ge_add" [@@noalloc]
 
   external double_scalarmult :
-    t -> t -> Bigstring.t -> Bigstring.t -> unit =
+    t -> Bigstring.t -> Bigstring.t -> unit =
     "caml_monocypher_ge_double_scalarmult" [@@noalloc]
 
   external scalarmult_base : t -> Bigstring.t -> unit =
@@ -594,21 +588,17 @@ module Ed25519 = struct
     Bigstring.blit_of_string bits 0 buf 0 (String.length bits)
 
   let scalarmult p z =
-    let ge = Bigstring.create ge_bytes in
     let z1_buf = Bigstring.create 32 in
     let z2_buf = Bigstring.make 32 '\x00' in
     blit_z z z1_buf ;
-    double_scalarmult ge p z1_buf z2_buf ;
-    ge
+    double_scalarmult p z1_buf z2_buf
 
   let double_scalarmult p z1 z2 =
-    let ge = Bigstring.create ge_bytes in
     let z1_buf = Bigstring.create 32 in
     let z2_buf = Bigstring.create 32 in
     blit_z z1 z1_buf ;
     blit_z z2 z2_buf ;
-    double_scalarmult ge p z1_buf z2_buf ;
-    ge
+    double_scalarmult p z1_buf z2_buf
 
   let scalarmult_base z =
     let ge = Bigstring.create ge_bytes in
diff --git a/src/monocypher.mli b/src/monocypher.mli
index 1e33a0b..af8d64a 100644
--- a/src/monocypher.mli
+++ b/src/monocypher.mli
@@ -73,8 +73,7 @@ module DH : sig
   val copy : 'a key -> 'a key
   val sk_of_bytes : ?pos:int -> Bigstring.t -> secret key
   val neuterize : _ key -> public key
-  val shared : secret key -> public key -> shared key option
-  val shared_exn : secret key -> public key -> shared key
+  val shared : secret key -> public key -> shared key
   val wipe : _ key -> unit
 
   val buffer : _ key -> Bigstring.t
@@ -176,9 +175,9 @@ module Ed25519 : sig
       serialization in it. *)
 
   val add : t -> cached -> t
-  val scalarmult : t -> Z.t -> t
+  val scalarmult : t -> Z.t -> unit
 
-  val double_scalarmult : t -> Z.t -> Z.t -> t
+  val double_scalarmult : t -> Z.t -> Z.t -> unit
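-  (** [double_scalarmult t a b] is [at + bB] where B is the curve's
-      base point. *)
+  (** [double_scalarmult t a b] overwrites [t] in place with [at + bB],
+      where B is the curve's base point. *)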
 
diff --git a/src/monocypher_stubs.c b/src/monocypher_stubs.c
index 2598760..dd1bb27 100644
--- a/src/monocypher_stubs.c
+++ b/src/monocypher_stubs.c
@@ -1,31 +1,71 @@
+// Monocypher version __git__
+//
+// This file is dual-licensed.  Choose whichever licence you want from
+// the two licences listed below.
+//
+// The first licence is a regular 2-clause BSD licence.  The second licence
+// is the CC-0 from Creative Commons. It is intended to release Monocypher
+// to the public domain.  The BSD licence serves as a fallback option.
+//
+// SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
+//
+// ------------------------------------------------------------------------
+//
+// Copyright (c) 2017-2020, Loup Vaillant
+// All rights reserved.
+//
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the
+//    distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ------------------------------------------------------------------------
+//
+// Written in 2017-2020 by Loup Vaillant
+//
+// To the extent possible under law, the author(s) have dedicated all copyright
+// and related neighboring rights to this software to the public domain
+// worldwide.  This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication along
+// with this software.  If not, see
+// <https://creativecommons.org/publicdomain/zero/1.0/>
+
 #include "monocypher.h"
 
 /////////////////
 /// Utilities ///
 /////////////////
-
-// By default, EdDSA signatures use blake2b.  SHA-512 is provided as an
-// option for full ed25519 compatibility. To use with SHA-512, compile
-// with option -DED25519_SHA512 and provide the "sha512" header.
-#ifdef ED25519_SHA512
-    #define HASH crypto_sha512
-#else
-    #define HASH crypto_blake2b
-#endif
-#define COMBINE1(x, y) x ## y
-#define COMBINE2(x, y) COMBINE1(x, y)
-#define HASH_CTX    COMBINE2(HASH, _ctx)
-#define HASH_INIT   COMBINE2(HASH, _init)
-#define HASH_UPDATE COMBINE2(HASH, _update)
-#define HASH_FINAL  COMBINE2(HASH, _final)
-
-#define FOR(i, start, end)   for (size_t (i) = (start); (i) < (end); (i)++)
-#define WIPE_CTX(ctx)        crypto_wipe(ctx   , sizeof(*(ctx)))
-#define WIPE_BUFFER(buffer)  crypto_wipe(buffer, sizeof(buffer))
-#define MIN(a, b)            ((a) <= (b) ? (a) : (b))
-#define ALIGN(x, block_size) ((~(x) + 1) & ((block_size) - 1))
+#define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
+#define FOR(i, start, end)         FOR_T(size_t, i, start, end)
+#define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
+#define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))
+#define MIN(a, b)                  ((a) <= (b) ? (a) : (b))
+#define MAX(a, b)                  ((a) >= (b) ? (a) : (b))
+#define ALIGN(x, block_size)       ((~(x) + 1) & ((block_size) - 1))
 typedef int8_t   i8;
 typedef uint8_t  u8;
+typedef int16_t  i16;
 typedef uint32_t u32;
 typedef int32_t  i32;
 typedef int64_t  i64;
@@ -82,7 +122,7 @@ static u64 x16(const u8 a[16], const u8 b[16])
     return (load64_le(a + 0) ^ load64_le(b + 0))
         |  (load64_le(a + 8) ^ load64_le(b + 8));
 }
-static u64 x32(const u8 a[16],const u8 b[16]){return x16(a,b)| x16(a+16, b+16);}
+static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
@@ -134,150 +174,145 @@ static void chacha20_rounds(u32 out[16], const u32 in[16])
     out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
 }
 
-static void chacha20_init_key(crypto_chacha_ctx *ctx, const u8 key[32])
+static void chacha20_init_key(u32 block[16], const u8 key[32])
 {
     // constant
-    ctx->input[0] = load32_le((u8*)"expa");
-    ctx->input[1] = load32_le((u8*)"nd 3");
-    ctx->input[2] = load32_le((u8*)"2-by");
-    ctx->input[3] = load32_le((u8*)"te k");
+    block[0] = load32_le((const u8*)"expa");
+    block[1] = load32_le((const u8*)"nd 3");
+    block[2] = load32_le((const u8*)"2-by");
+    block[3] = load32_le((const u8*)"te k");
     // key
     FOR (i, 0, 8) {
-        ctx->input[i+4] = load32_le(key + i*4);
+        block[i+4] = load32_le(key + i*4);
     }
 }
 
-static u8 chacha20_pool_byte(crypto_chacha_ctx *ctx)
+static u64 chacha20_core(u32 input[16], u8 *cipher_text, const u8 *plain_text,
+                         size_t text_size)
 {
-    u32 pool_word = ctx->pool[ctx->pool_idx >> 2];
-    u8  pool_byte = pool_word >> (8*(ctx->pool_idx & 3));
-    ctx->pool_idx++;
-    return pool_byte;
-}
-
-// Fill the pool if needed, update the counters
-static void chacha20_refill_pool(crypto_chacha_ctx *ctx)
-{
-    chacha20_rounds(ctx->pool, ctx->input);
-    FOR (j, 0, 16) {
-        ctx->pool[j] += ctx->input[j];
+    // Whole blocks
+    u32    pool[16];
+    size_t nb_blocks = text_size >> 6;
+    FOR (i, 0, nb_blocks) {
+        chacha20_rounds(pool, input);
+        if (plain_text != 0) {
+            FOR (j, 0, 16) {
+                u32 p = pool[j] + input[j];
+                store32_le(cipher_text, p ^ load32_le(plain_text));
+                cipher_text += 4;
+                plain_text  += 4;
+            }
+        } else {
+            FOR (j, 0, 16) {
+                u32 p = pool[j] + input[j];
+                store32_le(cipher_text, p);
+                cipher_text += 4;
+            }
+        }
+        input[12]++;
+        if (input[12] == 0) {
+            input[13]++;
+        }
     }
-    ctx->pool_idx = 0;
-    ctx->input[12]++;
-    if (ctx->input[12] == 0) {
-        ctx->input[13]++;
+    text_size &= 63;
+
+    // Last (incomplete) block
+    if (text_size > 0) {
+        if (plain_text == 0) {
+            plain_text = zero;
+        }
+        chacha20_rounds(pool, input);
+        u8 tmp[64];
+        FOR (i, 0, 16) {
+            store32_le(tmp + i*4, pool[i] + input[i]);
+        }
+        FOR (i, 0, text_size) {
+            cipher_text[i] = tmp[i] ^ plain_text[i];
+        }
+        WIPE_BUFFER(tmp);
     }
+    WIPE_BUFFER(pool);
+    return input[12] + ((u64)input[13] << 32) + (text_size > 0);
 }
 
-void crypto_chacha20_H(u8 out[32], const u8 key[32], const u8 in[16])
+void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
 {
-    crypto_chacha_ctx ctx;
-    chacha20_init_key(&ctx, key);
+    u32 block[16];
+    chacha20_init_key(block, key);
+    // input
     FOR (i, 0, 4) {
-        ctx.input[i+12] = load32_le(in + i*4);
+        block[i+12] = load32_le(in + i*4);
     }
-    u32 buffer[16];
-    chacha20_rounds(buffer, ctx.input);
+    chacha20_rounds(block, block);
     // prevents reversal of the rounds by revealing only half of the buffer.
     FOR (i, 0, 4) {
-        store32_le(out      + i*4, buffer[i     ]); // constant
-        store32_le(out + 16 + i*4, buffer[i + 12]); // counter and nonce
+        store32_le(out      + i*4, block[i     ]); // constant
+        store32_le(out + 16 + i*4, block[i + 12]); // counter and nonce
     }
-    WIPE_CTX(&ctx);
-    WIPE_BUFFER(buffer);
+    WIPE_BUFFER(block);
 }
 
-static void chacha20_encrypt(crypto_chacha_ctx *ctx,
-                             u8                *cipher_text,
-                             const u8          *plain_text,
-                             size_t             text_size)
+u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
+                        size_t text_size, const u8 key[32], const u8 nonce[8],
+                        u64 ctr)
 {
-    FOR (i, 0, text_size) {
-        if (ctx->pool_idx == 64) {
-            chacha20_refill_pool(ctx);
-        }
-        u8 plain = 0;
-        if (plain_text != 0) {
-            plain = *plain_text;
-            plain_text++;
-        }
-        *cipher_text = chacha20_pool_byte(ctx) ^ plain;
-        cipher_text++;
-    }
+    u32 input[16];
+    chacha20_init_key(input, key);
+    input[12] = (u32) ctr;
+    input[13] = (u32)(ctr >> 32);
+    input[14] = load32_le(nonce);
+    input[15] = load32_le(nonce + 4);
+    ctr = chacha20_core(input, cipher_text, plain_text, text_size);
+    WIPE_BUFFER(input);
+    return ctr;
 }
 
-void crypto_chacha20_init(crypto_chacha_ctx *ctx,
-                          const u8           key[32],
-                          const u8           nonce[8])
+u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
+                             size_t text_size,
+                             const u8 key[32], const u8 nonce[12], u32 ctr)
 {
-    chacha20_init_key      (ctx, key);     // key
-    crypto_chacha20_set_ctr(ctx, 0  );     // counter
-    ctx->input[14] = load32_le(nonce + 0); // nonce
-    ctx->input[15] = load32_le(nonce + 4); // nonce
+    u32 input[16];
+    chacha20_init_key(input, key);
+    input[12] = (u32) ctr;
+    input[13] = load32_le(nonce);
+    input[14] = load32_le(nonce + 4);
+    input[15] = load32_le(nonce + 8);
+    ctr = (u32)chacha20_core(input, cipher_text, plain_text, text_size);
+    WIPE_BUFFER(input);
+    return ctr;
 }
 
-void crypto_chacha20_x_init(crypto_chacha_ctx *ctx,
-                            const u8           key[32],
-                            const u8           nonce[24])
+u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
+                         size_t text_size,
+                         const u8 key[32], const u8 nonce[24], u64 ctr)
 {
-    u8 derived_key[32];
-    crypto_chacha20_H(derived_key, key, nonce);
-    crypto_chacha20_init(ctx, derived_key, nonce + 16);
-    WIPE_BUFFER(derived_key);
+    u8 sub_key[32];
+    crypto_hchacha20(sub_key, key, nonce);
+    ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
+                              sub_key, nonce+16, ctr);
+    WIPE_BUFFER(sub_key);
+    return ctr;
 }
 
-void crypto_chacha20_set_ctr(crypto_chacha_ctx *ctx, u64 ctr)
+void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
+                     const u8 key[32], const u8 nonce[8])
 {
-    ctx->input[12] = ctr & 0xffffffff;
-    ctx->input[13] = ctr >> 32;
-    ctx->pool_idx  = 64;  // The random pool (re)starts empty
-}
+    crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
 
-void crypto_chacha20_encrypt(crypto_chacha_ctx *ctx,
-                             u8                *cipher_text,
-                             const u8          *plain_text,
-                             size_t             text_size)
+}
+void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
+                          size_t text_size,
+                          const u8 key[32], const u8 nonce[12])
 {
-    // Align ourselves with block boundaries
-    size_t align = MIN(ALIGN(ctx->pool_idx, 64), text_size);
-    chacha20_encrypt(ctx, cipher_text, plain_text, align);
-    if (plain_text != 0) {
-        plain_text += align;
-    }
-    cipher_text += align;
-    text_size   -= align;
-
-    // Process the message block by block
-    FOR (i, 0, text_size >> 6) {  // number of blocks
-        chacha20_refill_pool(ctx);
-        if (plain_text != 0) {
-            FOR (j, 0, 16) {
-                u32 plain = load32_le(plain_text);
-                store32_le(cipher_text, ctx->pool[j] ^ plain);
-                plain_text  += 4;
-                cipher_text += 4;
-            }
-        } else {
-            FOR (j, 0, 16) {
-                store32_le(cipher_text, ctx->pool[j]);
-                cipher_text += 4;
-            }
-        }
-        ctx->pool_idx = 64;
-    }
-    text_size &= 63;
-
-    // remaining bytes
-    chacha20_encrypt(ctx, cipher_text, plain_text, text_size);
+    crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
 }
 
-void crypto_chacha20_stream(crypto_chacha_ctx *ctx,
-                            uint8_t *stream, size_t size)
+void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
+                      const u8 key[32], const u8 nonce[24])
 {
-    crypto_chacha20_encrypt(ctx, stream, 0, size);
+    crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
 }
 
-
 /////////////////
 /// Poly 1305 ///
 /////////////////
@@ -309,11 +344,11 @@ static void poly_block(crypto_poly1305_ctx *ctx)
     const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr1 == (r3 >> 2) * 5
 
     // (h + c) * r, without carry propagation
-    const u64 x0 = s0*r0 + s1*rr3 + s2*rr2 + s3*rr1 +s4*rr0;//<=97ffffe007fffff8
-    const u64 x1 = s0*r1 + s1*r0  + s2*rr3 + s3*rr2 +s4*rr1;//<=8fffffe20ffffff6
-    const u64 x2 = s0*r2 + s1*r1  + s2*r0  + s3*rr3 +s4*rr2;//<=87ffffe417fffff4
-    const u64 x3 = s0*r3 + s1*r2  + s2*r1  + s3*r0  +s4*rr3;//<=7fffffe61ffffff2
-    const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits      //<=               f
+    const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
+    const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
+    const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
+    const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
+    const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits    // <=                f
 
     // partial reduction modulo 2^130 - 5
     const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
@@ -324,20 +359,19 @@ static void poly_block(crypto_poly1305_ctx *ctx)
     const u64 u4 = (u3 >> 32)     + (u5 & 3);
 
     // Update the hash
-    ctx->h[0] = u0 & 0xffffffff; // u0 <= 1_9ffffff0
-    ctx->h[1] = u1 & 0xffffffff; // u1 <= 1_97ffffe0
-    ctx->h[2] = u2 & 0xffffffff; // u2 <= 1_8fffffe2
-    ctx->h[3] = u3 & 0xffffffff; // u3 <= 1_87ffffe4
-    ctx->h[4] = (u32)u4;         // u4 <=          4
+    ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
+    ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
+    ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
+    ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
+    ctx->h[4] = (u32)u4; // u4 <=          4
 }
 
-// (re-)initializes the input counter and input buffer
+// (re-)initialises the input counter and input buffer
 static void poly_clear_c(crypto_poly1305_ctx *ctx)
 {
-    ctx->c[0]  = 0;
-    ctx->c[1]  = 0;
-    ctx->c[2]  = 0;
-    ctx->c[3]  = 0;
+    FOR (i, 0, 4) {
+        ctx->c[i] = 0;
+    }
     ctx->c_idx = 0;
 }
 
@@ -388,10 +422,9 @@ void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
     // Process the message block by block
     size_t nb_blocks = message_size >> 4;
     FOR (i, 0, nb_blocks) {
-        ctx->c[0] = load32_le(message +  0);
-        ctx->c[1] = load32_le(message +  4);
-        ctx->c[2] = load32_le(message +  8);
-        ctx->c[3] = load32_le(message + 12);
+        FOR (j, 0, 4) {
+            ctx->c[j] = load32_le(message +  j*4);
+        }
         poly_block(ctx);
         message += 16;
     }
@@ -482,6 +515,8 @@ static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
         { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
         {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
         { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
     };
 
     // init work vector
@@ -491,16 +526,16 @@ static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
     u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
     u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
     u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
-    u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ is_last_block;
+    u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
     u64 v7 = ctx->hash[7];  u64 v15 = iv[7];
 
     // mangle work vector
-    uint64_t *input = ctx->input;
+    u64 *input = ctx->input;
 #define BLAKE2_G(v, a, b, c, d, x, y)                  \
     v##a += v##b + x;  v##d = rotr64(v##d ^ v##a, 32); \
     v##c += v##d;      v##b = rotr64(v##b ^ v##c, 24); \
     v##a += v##b + y;  v##d = rotr64(v##d ^ v##a, 16); \
-    v##c += v##d;      v##b = rotr64(v##b ^ v##c, 63);
+    v##c += v##d;      v##b = rotr64(v##b ^ v##c, 63)
 #define BLAKE2_ROUND(i)                                                 \
     BLAKE2_G(v, 0, 4,  8, 12, input[sigma[i][ 0]], input[sigma[i][ 1]]);\
     BLAKE2_G(v, 1, 5,  9, 13, input[sigma[i][ 2]], input[sigma[i][ 3]]);\
@@ -511,9 +546,15 @@ static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
     BLAKE2_G(v, 2, 7,  8, 13, input[sigma[i][12]], input[sigma[i][13]]);\
     BLAKE2_G(v, 3, 4,  9, 14, input[sigma[i][14]], input[sigma[i][15]])
 
+#ifdef BLAKE2_NO_UNROLLING
+    FOR (i, 0, 12) {
+        BLAKE2_ROUND(i);
+    }
+#else
     BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
     BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
     BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(0);  BLAKE2_ROUND(1);
+#endif
 
     // update hash
     ctx->hash[0] ^= v0 ^ v8;
@@ -567,8 +608,8 @@ void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
     }
     ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
 
-    ctx->input_offset[0] = 0;         // begining of the input, no offset
-    ctx->input_offset[1] = 0;         // begining of the input, no offset
+    ctx->input_offset[0] = 0;         // beginning of the input, no offset
+    ctx->input_offset[1] = 0;         // beginning of the input, no offset
     ctx->hash_size       = hash_size; // remember the hash size we want
     ctx->input_idx       = 0;
 
@@ -614,13 +655,13 @@ void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
     FOR (i, ctx->input_idx, 128) {
         blake2b_set_input(ctx, 0, i);
     }
-    blake2b_incr(ctx);         // update the input offset
-    blake2b_compress(ctx, -1); // compress the last block
+    blake2b_incr(ctx);        // update the input offset
+    blake2b_compress(ctx, 1); // compress the last block
     size_t nb_words = ctx->hash_size >> 3;
     FOR (i, 0, nb_words) {
         store64_le(hash + i*8, ctx->hash[i]);
     }
-    FOR (i, nb_words * 8, ctx->hash_size) {
+    FOR (i, nb_words << 3, ctx->hash_size) {
         hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
     }
     WIPE_CTX(ctx);
@@ -641,6 +682,28 @@ void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
     crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
 }
 
+static void blake2b_vtable_init(void *ctx) // ctx is cast to crypto_sign_ctx*
+{
+    crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
+}
+
+static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s)
+{
+    crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
+}
+
+static void blake2b_vtable_final(void *ctx, u8 *h)
+{
+    crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
+}
+
+const crypto_sign_vtable crypto_blake2b_vtable = {
+    crypto_blake2b,          // one-shot hash of a whole message
+    blake2b_vtable_init,     // incremental interface: init, update, final
+    blake2b_vtable_update,
+    blake2b_vtable_final,
+    sizeof(crypto_sign_ctx), // size of the context the adapters cast to
+};
 
 ////////////////
 /// Argon2 i ///
@@ -658,7 +721,7 @@ static void wipe_block(block *b)
     }
 }
 
-// updates a blake2 hash with a 32 bit word, little endian.
+// updates a Blake2 hash with a 32 bit word, little endian.
 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
 {
     u8 buf[4];
@@ -701,7 +764,7 @@ static void extended_hash(u8       *digest, u32 digest_size,
     if (digest_size > 64) {
         // the conversion to u64 avoids integer overflow on
         // ludicrously big hash sizes.
-        u32 r   = (((u64)digest_size + 31) >> 5) - 2;
+        u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
         u32 i   =  1;
         u32 in  =  0;
         u32 out = 32;
@@ -778,23 +841,21 @@ static void g_xor(block *result, const block *x, const block *y, block *tmp)
     xor_block (result, tmp); // result = R ^ old ^ Z
 }
 
-// unary version of the compression function.
+// Unary version of the compression function.
 // The missing argument is implied zero.
 // Does the transformation in place.
-static void unary_g(block *work_block)
+static void unary_g(block *work_block, block *tmp)
 {
     // work_block == R
-    block tmp;
-    copy_block(&tmp, work_block); // tmp        = R
-    g_rounds(work_block);         // work_block = Z
-    xor_block(work_block, &tmp);  // work_block = Z ^ R
-    wipe_block(&tmp);
+    copy_block(tmp, work_block); // tmp        = R
+    g_rounds  (work_block);      // work_block = Z
+    xor_block (work_block, tmp); // work_block = Z ^ R
 }
 
 // Argon2i uses a kind of stream cipher to determine which reference
 // block it will take to synthesise the next block.  This context hold
 // that stream's state.  (It's very similar to Chacha20.  The block b
-// is anologous to Chacha's own pool)
+// is analogous to Chacha's own pool)
 typedef struct {
     block b;
     u32 pass_number;
@@ -811,7 +872,7 @@ typedef struct {
 // easier, but timing attacks are the bigger threat in many settings.
 static void gidx_refresh(gidx_ctx *ctx)
 {
-    // seed the begining of the block...
+    // seed the beginning of the block...
     ctx->b.a[0] = ctx->pass_number;
     ctx->b.a[1] = 0;  // lane number (we have only one)
     ctx->b.a[2] = ctx->slice_number;
@@ -823,8 +884,10 @@ static void gidx_refresh(gidx_ctx *ctx)
 
     // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
     // (G "square" function), to get cheap pseudo-random numbers.
-    unary_g(&ctx->b);
-    unary_g(&ctx->b);
+    block tmp;
+    unary_g(&ctx->b, &tmp);
+    unary_g(&ctx->b, &tmp);
+    wipe_block(&tmp);
 }
 
 static void gidx_init(gidx_ctx *ctx,
@@ -837,7 +900,7 @@ static void gidx_init(gidx_ctx *ctx,
     ctx->nb_iterations = nb_iterations;
     ctx->ctr           = 0;
 
-    // Offset from the begining of the segment.  For the first slice
+    // Offset from the beginning of the segment.  For the first slice
     // of the first pass, we start at the *third* block, so the offset
     // starts at 2, not 0.
     if (pass_number != 0 || slice_number != 0) {
@@ -943,25 +1006,24 @@ void crypto_argon2i_general(u8       *hash,      u32 hash_size,
 
     // fill (then re-fill) the rest of the blocks
     block tmp;
-    gidx_ctx ctx;
-    FOR (pass_number, 0, nb_iterations) {
+    gidx_ctx ctx; // public information, no need to wipe
+    FOR_T (u32, pass_number, 0, nb_iterations) {
         int first_pass = pass_number == 0;
 
-        FOR (segment, 0, 4) {
-            gidx_init(&ctx, (u32)pass_number, (u32)segment,
-                      nb_blocks, nb_iterations);
+        FOR_T (u32, segment, 0, 4) {
+            gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
 
             // On the first segment of the first pass,
             // blocks 0 and 1 are already filled.
             // We use the offset to skip them.
             u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
-            u32 segment_start = (u32)segment * segment_size + start_offset;
-            u32 segment_end   = ((u32)segment + 1) * segment_size;
-            FOR (current_block, segment_start, segment_end) {
+            u32 segment_start = segment * segment_size + start_offset;
+            u32 segment_end   = (segment + 1) * segment_size;
+            FOR_T (u32, current_block, segment_start, segment_end) {
                 u32 reference_block = gidx_next(&ctx);
                 u32 previous_block  = current_block == 0
                                     ? nb_blocks - 1
-                                    : (u32)current_block - 1;
+                                    : current_block - 1;
                 block *c = blocks + current_block;
                 block *p = blocks + previous_block;
                 block *r = blocks + reference_block;
@@ -970,7 +1032,6 @@ void crypto_argon2i_general(u8       *hash,      u32 hash_size,
             }
         }
     }
-    wipe_block(&ctx.b);
     wipe_block(&tmp);
     // hash the very last block with H' into the output hash
     u8 final_block[1024];
@@ -1003,7 +1064,7 @@ void crypto_argon2i(u8       *hash,      u32 hash_size,
 ////////////////////////////////////
 /// Arithmetic modulo 2^255 - 19 ///
 ////////////////////////////////////
-//  Taken from Supercop's ref10 implementation.
+//  Taken from SUPERCOP's ref10 implementation.
 //  A bit bigger than TweetNaCl, over 4 times faster.
 
 // field element
@@ -1019,8 +1080,9 @@ static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
 
 static void fe_cswap(fe f, fe g, int b)
 {
+    i32 mask = -b; // -1 = 0xffffffff
     FOR (i, 0, 10) {
-        i32 x = (f[i] ^ g[i]) & -b;
+        i32 x = (f[i] ^ g[i]) & mask;
         f[i] = f[i] ^ x;
         g[i] = g[i] ^ x;
     }
@@ -1028,24 +1090,25 @@ static void fe_cswap(fe f, fe g, int b)
 
 static void fe_ccopy(fe f, const fe g, int b)
 {
+    i32 mask = -b; // -1 = 0xffffffff
     FOR (i, 0, 10) {
-        i32 x = (f[i] ^ g[i]) & -b;
+        i32 x = (f[i] ^ g[i]) & mask;
         f[i] = f[i] ^ x;
     }
 }
 
 #define FE_CARRY                                                        \
     i64 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;                         \
-    c9 = (t9 + (i64) (1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * (1 << 25); \
-    c1 = (t1 + (i64) (1<<24)) >> 25; t2 += c1;      t1 -= c1 * (1 << 25); \
-    c3 = (t3 + (i64) (1<<24)) >> 25; t4 += c3;      t3 -= c3 * (1 << 25); \
-    c5 = (t5 + (i64) (1<<24)) >> 25; t6 += c5;      t5 -= c5 * (1 << 25); \
-    c7 = (t7 + (i64) (1<<24)) >> 25; t8 += c7;      t7 -= c7 * (1 << 25); \
-    c0 = (t0 + (i64) (1<<25)) >> 26; t1 += c0;      t0 -= c0 * (1 << 26); \
-    c2 = (t2 + (i64) (1<<25)) >> 26; t3 += c2;      t2 -= c2 * (1 << 26); \
-    c4 = (t4 + (i64) (1<<25)) >> 26; t5 += c4;      t4 -= c4 * (1 << 26); \
-    c6 = (t6 + (i64) (1<<25)) >> 26; t7 += c6;      t6 -= c6 * (1 << 26); \
-    c8 = (t8 + (i64) (1<<25)) >> 26; t9 += c8;      t8 -= c8 * (1 << 26); \
+    c9 = (t9 + (i64)(1<<24)) >> 25; t0 += c9 * 19; t9 -= c9 * (1 << 25); \
+    c1 = (t1 + (i64)(1<<24)) >> 25; t2 += c1;      t1 -= c1 * (1 << 25); \
+    c3 = (t3 + (i64)(1<<24)) >> 25; t4 += c3;      t3 -= c3 * (1 << 25); \
+    c5 = (t5 + (i64)(1<<24)) >> 25; t6 += c5;      t5 -= c5 * (1 << 25); \
+    c7 = (t7 + (i64)(1<<24)) >> 25; t8 += c7;      t7 -= c7 * (1 << 25); \
+    c0 = (t0 + (i64)(1<<25)) >> 26; t1 += c0;      t0 -= c0 * (1 << 26); \
+    c2 = (t2 + (i64)(1<<25)) >> 26; t3 += c2;      t2 -= c2 * (1 << 26); \
+    c4 = (t4 + (i64)(1<<25)) >> 26; t5 += c4;      t4 -= c4 * (1 << 26); \
+    c6 = (t6 + (i64)(1<<25)) >> 26; t7 += c6;      t6 -= c6 * (1 << 26); \
+    c8 = (t8 + (i64)(1<<25)) >> 26; t9 += c8;      t8 -= c8 * (1 << 26); \
     h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
     h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9
 
@@ -1060,10 +1123,11 @@ static void fe_frombytes(fe h, const u8 s[32])
     i64 t6 =  load24_le(s + 20) << 7;
     i64 t7 =  load24_le(s + 23) << 5;
     i64 t8 =  load24_le(s + 26) << 4;
-    i64 t9 = (load24_le(s + 29) & 8388607) << 2;
+    i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;
     FE_CARRY;
 }
 
+// multiply a field element by a signed 32-bit integer
 static void fe_mul_small(fe h, const fe f, i32 g)
 {
     i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
@@ -1124,7 +1188,7 @@ static void fe_mul(fe h, const fe f, const fe g)
     c9 = (h9 + (i64) (1<<24)) >> 25; h0 += c9 * 19; h9 -= c9 * (1 << 25); \
     c0 = (h0 + (i64) (1<<25)) >> 26; h1 += c0;      h0 -= c0 * (1 << 26); \
     h[0]=(i32)h0;  h[1]=(i32)h1;  h[2]=(i32)h2;  h[3]=(i32)h3;  h[4]=(i32)h4; \
-    h[5]=(i32)h5;  h[6]=(i32)h6;  h[7]=(i32)h7;  h[8]=(i32)h8;  h[9]=(i32)h9; \
+    h[5]=(i32)h5;  h[6]=(i32)h6;  h[7]=(i32)h7;  h[8]=(i32)h8;  h[9]=(i32)h9
 
     CARRY;
 }
@@ -1163,36 +1227,13 @@ static void fe_sq(fe h, const fe f)
     CARRY;
 }
 
+// h = 2 * (f^2)
 static void fe_sq2(fe h, const fe f)
 {
     fe_sq(h, f);
     fe_mul_small(h, h, 2);
 }
 
-// This could be simplified, but it would be slower
-static void fe_invert(fe out, const fe z)
-{
-    fe t0, t1, t2, t3;
-    fe_sq(t0, z );
-    fe_sq(t1, t0);
-    fe_sq(t1, t1);
-    fe_mul(t1,  z, t1);
-    fe_mul(t0, t0, t1);
-    fe_sq(t2, t0);                                fe_mul(t1 , t1, t2);
-    fe_sq(t2, t1); FOR (i, 1,   5) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t2, t1); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
-    fe_sq(t3, t2); FOR (i, 1,  20) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
-    fe_sq(t2, t2); FOR (i, 1,  10) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t2, t1); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t2 , t2, t1);
-    fe_sq(t3, t2); FOR (i, 1, 100) fe_sq(t3, t3); fe_mul(t2 , t3, t2);
-    fe_sq(t2, t2); FOR (i, 1,  50) fe_sq(t2, t2); fe_mul(t1 , t2, t1);
-    fe_sq(t1, t1); FOR (i, 1,   5) fe_sq(t1, t1); fe_mul(out, t1, t0);
-    WIPE_BUFFER(t0);
-    WIPE_BUFFER(t1);
-    WIPE_BUFFER(t2);
-    WIPE_BUFFER(t3);
-}
-
 // This could be simplified, but it would be slower
 static void fe_pow22523(fe out, const fe z)
 {
@@ -1214,6 +1255,20 @@ static void fe_pow22523(fe out, const fe z)
     WIPE_BUFFER(t2);
 }
 
+// Inverting means raising to the power p - 2 = 2^255 - 21 (p = 2^255 - 19)
+// 2^255 - 21 = (2^252 - 3) * 8 + 3
+// So we reuse the multiplication chain of fe_pow22523
+static void fe_invert(fe out, const fe z)
+{
+    fe tmp;
+    fe_pow22523(tmp, z);
+    // out = tmp^8 * z^3 (append exponent bits 0, 1, 1)
+    fe_sq(tmp, tmp);                        // 0
+    fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
+    fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
+    WIPE_BUFFER(tmp);
+}
+
 static void fe_tobytes(u8 s[32], const fe h)
 {
     i32 t[10];
@@ -1264,15 +1319,12 @@ static int fe_isnonzero(const fe f)
 {
     u8 s[32];
     fe_tobytes(s, f);
-    u8 isnonzero = zerocmp32(s);
+    int isnonzero = zerocmp32(s);
     WIPE_BUFFER(s);
     return isnonzero;
 }
 
-///////////////
-/// X-25519 /// Taken from Supercop's ref10 implementation.
-///////////////
-
+// trim a scalar for scalar multiplication
 static void trim_scalar(u8 s[32])
 {
     s[ 0] &= 248;
@@ -1280,11 +1332,19 @@ static void trim_scalar(u8 s[32])
     s[31] |= 64;
 }
 
-static int scalar_bit(const u8 s[32], int i) { return (s[i>>3] >> (i&7)) & 1; }
+// get bit i of scalar s (bit 0 is the least significant bit of s[0])
+static int scalar_bit(const u8 s[32], int i) {
+    if (i < 0) { return 0; } // handle -1 for sliding windows
+    return (s[i>>3] >> (i&7)) & 1;
+}
+
+///////////////
+/// X-25519 /// Taken from SUPERCOP's ref10 implementation.
+///////////////
 
-int crypto_x25519(u8       raw_shared_secret[32],
-                  const u8 your_secret_key  [32],
-                  const u8 their_public_key [32])
+void crypto_x25519(u8       raw_shared_secret[32],
+                   const u8 your_secret_key  [32],
+                   const u8 their_public_key [32])
 {
     // computes the scalar product
     fe x1;
@@ -1300,7 +1360,7 @@ int crypto_x25519(u8       raw_shared_secret[32],
     // computes the actual scalar product (the result is in x2 and z2)
     fe x2, z2, x3, z3, t0, t1;
     // Montgomery ladder
-    // In projective coordinates, to avoid divisons: x = X / Z
+    // In projective coordinates, to avoid divisions: x = X / Z
     // We don't care about the y coordinate, it's only 1 bit of information
     fe_1(x2);        fe_0(z2); // "zero" point
     fe_copy(x3, x1); fe_1(z3); // "one"  point
@@ -1336,10 +1396,6 @@ int crypto_x25519(u8       raw_shared_secret[32],
     WIPE_BUFFER(x2);  WIPE_BUFFER(z2);
     WIPE_BUFFER(x3);  WIPE_BUFFER(z3);
     WIPE_BUFFER(t0);  WIPE_BUFFER(t1);
-
-    // Returns -1 if the output is all zero
-    // (happens with some malicious public keys)
-    return -1 - zerocmp32(raw_shared_secret);
 }
 
 void crypto_x25519_public_key(u8       public_key[32],
@@ -1353,11 +1409,12 @@ void crypto_x25519_public_key(u8       public_key[32],
 /// Ed25519 ///
 ///////////////
 
-static const  u64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
+static const  i64 L[32] = { 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
                             0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10};
 
+// r = x mod L (little-endian)
 static void modL(u8 *r, i64 x[64])
 {
     for (unsigned i = 63; i >= 32; i--) {
@@ -1385,11 +1442,12 @@ static void modL(u8 *r, i64 x[64])
     }
 }
 
+// Reduces a 64-byte hash modulo L (little endian)
 static void reduce(u8 r[64])
 {
     i64 x[64];
     FOR (i, 0, 64) {
-        x[i] = (u64) r[i];
+        x[i] = (i64)(u64)r[i]; // preserve unsigned
         r[i] = 0;
     }
     modL(r, x);
@@ -1400,17 +1458,18 @@ static void reduce(u8 r[64])
 static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
 {
     i64 s[64];
-    FOR (i,  0, 32) { s[i] = (u64) c[i]; }
-    FOR (i, 32, 64) { s[i] = 0;          }
+    FOR (i,  0, 32) { s[i] = (i64)(u64)c[i]; } // preserve unsigned
+    FOR (i, 32, 64) { s[i] = 0;              }
     FOR (i,  0, 32) {
         FOR (j, 0, 32) {
-            s[i+j] += a[i] * (u64) b[j];
+            s[i+j] += a[i] * (u64)b[j];
         }
     }
     modL(r, s);
     WIPE_BUFFER(s);
 }
 
+// Variable time! a must not be secret!
 static int is_above_L(const u8 a[32])
 {
     for (int i = 31; i >= 0; i--) {
@@ -1420,7 +1479,7 @@ static int is_above_L(const u8 a[32])
     return 1;
 }
 
-// Point in a twisted Edwards curve,
+// Point (group element, ge) in a twisted Edwards curve,
 // in extended projective coordinates.
 // x = X/Z, y = Y/Z, T = XY/Z
 typedef struct { fe X;  fe Y;  fe Z; fe T;  } ge;
@@ -1448,18 +1507,24 @@ static void ge_tobytes(u8 s[32], const ge *h)
     WIPE_BUFFER(y);
 }
 
-// Variable time! s must not be secret!
+// h = -s, where s is a point encoded in 32 bytes
+// ge_double_scalarmult_vartime() performs addition, but the algorithm it is
+// used for requires subtraction; thus we negate s on load so that we can do
+// addition in ge_double_scalarmult_vartime() later.
+//
+// Variable time! Inputs must not be secret!
+// => Use only to *check* signatures.
 static int ge_frombytes_neg_vartime(ge *h, const u8 s[32])
 {
     static const fe d = {
         -10913610, 13857413, -15372611, 6949391, 114729,
         -8787816, -6275908, -3247719, -18696448, -12055116
-    } ;
+    };
     static const fe sqrtm1 = {
         -32595792, -7943725, 9377950, 3500415, 12389472,
         -272473, -25146209, -2005654, 326686, 11406482
-    } ;
-    fe u, v, v3, vxx, check; // no secret, no wipe
+    };
+    fe u, v, v3; // no secret, no wipe
     fe_frombytes(h->Y, s);
     fe_1(h->Z);
     fe_sq(u, h->Y);            // y^2
@@ -1477,6 +1542,7 @@ static int ge_frombytes_neg_vartime(ge *h, const u8 s[32])
     fe_mul(h->X, h->X, v3);
     fe_mul(h->X, h->X, u);     // x = uv^3(uv^7)^((q-5)/8)
 
+    fe vxx, check; // no secret, no wipe
     fe_sq(vxx, h->X);
     fe_mul(vxx, vxx, v);
     fe_sub(check, vxx, u);     // vx^2-u
@@ -1506,9 +1572,11 @@ static void ge_cache(ge_cached *c, const ge *p)
     fe_mul (c->T2, p->T, D2  );
 }
 
+// Internal buffers are not wiped! Inputs must not be secret!
+// => Use only to *check* signatures.
 static void ge_add(ge *s, const ge *p, const ge_cached *q)
 {
-    fe a, b; // not used to process secrets, no need to wipe
+    fe a, b;
     fe_add(a   , p->Y, p->X );
     fe_sub(b   , p->Y, p->X );
     fe_mul(a   , a   , q->Yp);
@@ -1528,6 +1596,8 @@ static void ge_add(ge *s, const ge *p, const ge_cached *q)
     fe_mul(s->Z, a   , b   );
 }
 
+// Internal buffers are not wiped! Inputs must not be secret!
+// => Use only to *check* signatures.
 static void ge_sub(ge *s, const ge *p, const ge_cached *q)
 {
     ge_cached neg;
@@ -1559,6 +1629,16 @@ static void ge_madd(ge *s, const ge *p, const fe yp, const fe ym, const fe t2,
     fe_mul(s->Z, a   , b   );
 }
 
+// Internal buffers are not wiped! Inputs must not be secret!
+// => Use only to *check* signatures.
+static void ge_msub(ge *s, const ge *p, const fe yp, const fe ym, const fe t2,
+                    fe a, fe b)
+{
+    fe n2;                           // holds -t2 (not wiped, see above)
+    fe_neg(n2, t2);
+    ge_madd(s, p, ym, yp, n2, a, b); // yp/ym swapped + t2 negated = subtraction
+}
+
 static void ge_double(ge *s, const ge *p, ge *q)
 {
     fe_sq (q->X, p->X);
@@ -1577,86 +1657,159 @@ static void ge_double(ge *s, const ge *p, ge *q)
     fe_mul(s->T, q->X , q->T);
 }
 
-// Compute signed sliding windows (either 0, or odd numbers between -15 and 15)
-static void slide(i8 adds[258], const u8 scalar[32])
-{
-    FOR (i,   0, 256) { adds[i] = scalar_bit(scalar, i); }
-    FOR (i, 256, 258) { adds[i] = 0;                     }
-    FOR (i, 0, 254) {
-        if (adds[i] != 0) {
-            // base value of the 5-bit window
-            FOR (j, 1, 5) {
-                adds[i  ] |= adds[i+j] << j;
-                adds[i+j]  = 0;
-            }
-            if (adds[i] > 16) {
-                // go back to [-15, 15], propagate carry.
-                adds[i] -= 32;
-                int j = i + 5;
-                while (adds[j] != 0) {
-                    adds[j] = 0;
-                    j++;
-                }
-                adds[j] = 1;
-            }
-        }
+static const fe window_Yp[8] = {
+    {25967493, -14356035, 29566456, 3660896, -12694345,
+     4014787, 27544626, -11754271, -6079156, 2047605},
+    {15636291, -9688557, 24204773, -7912398, 616977,
+     -16685262, 27787600, -14772189, 28944400, -1550024},
+    {10861363, 11473154, 27284546, 1981175, -30064349,
+     12577861, 32867885, 14515107, -15438304, 10819380},
+    {5153746, 9909285, 1723747, -2777874, 30523605,
+     5516873, 19480852, 5230134, -23952439, -15175766},
+    {-22518993, -6692182, 14201702, -8745502, -23510406,
+     8844726, 18474211, -1361450, -13062696, 13821877},
+    {-25154831, -4185821, 29681144, 7868801, -6854661,
+     -9423865, -12437364, -663000, -31111463, -16132436},
+    {-33521811, 3180713, -2394130, 14003687, -16903474,
+     -16270840, 17238398, 4729455, -18074513, 9256800},
+    {-3151181, -5046075, 9282714, 6866145, -31907062,
+     -863023, -18940575, 15033784, 25105118, -7894876},
+};
+static const fe window_Ym[8] = {
+    {-12545711, 934262, -2722910, 3049990, -727428,
+     9406986, 12720692, 5043384, 19500929, -15469378},
+    {16568933, 4717097, -11556148, -1102322, 15682896,
+     -11807043, 16354577, -11775962, 7689662, 11199574},
+    {4708026, 6336745, 20377586, 9066809, -11272109,
+     6594696, -25653668, 12483688, -12668491, 5581306},
+    {-30269007, -3463509, 7665486, 10083793, 28475525,
+     1649722, 20654025, 16520125, 30598449, 7715701},
+    {-6455177, -7839871, 3374702, -4740862, -27098617,
+     -10571707, 31655028, -7212327, 18853322, -14220951},
+    {25576264, -2703214, 7349804, -11814844, 16472782,
+     9300885, 3844789, 15725684, 171356, 6466918},
+    {-25182317, -4174131, 32336398, 5036987, -21236817,
+     11360617, 22616405, 9761698, -19827198, 630305},
+    {-24326370, 15950226, -31801215, -14592823, -11662737,
+     -5090925, 1573892, -2625887, 2198790, -15804619},
+};
+static const fe window_T2[8] = {
+    {-8738181, 4489570, 9688441, -14785194, 10184609,
+     -12363380, 29287919, 11864899, -24514362, -4438546},
+    {30464156, -5976125, -11779434, -15670865, 23220365,
+     15915852, 7512774, 10017326, -17749093, -9920357},
+    {19563160, 16186464, -29386857, 4097519, 10237984,
+     -4348115, 28542350, 13850243, -23678021, -15815942},
+    {28881845, 14381568, 9657904, 3680757, -20181635,
+     7843316, -31400660, 1370708, 29794553, -1409300},
+    {4566830, -12963868, -28974889, -12240689, -7602672,
+     -2830569, -8514358, -10431137, 2207753, -3209784},
+    {23103977, 13316479, 9739013, -16149481, 817875,
+     -15038942, 8965339, -14088058, -30714912, 16193877},
+    {-13720693, 2639453, -24237460, -7406481, 9494427,
+     -5774029, -6554551, -15960994, -2449256, -14291300},
+    {-3099351, 10324967, -2241613, 7453183, -5446979,
+     -2735503, -13812022, -16236442, -32461234, -12290683},
+};
+
+// Incremental sliding windows (left to right)
+// Based on Roberto Maria Avanzi[2005]
+typedef struct {
+    i16 next_index; // position of the next signed digit
+    i8  next_digit; // next signed digit (odd number below 2^window_width)
+    u8  next_check; // point at which we must check for a new window
+} slide_ctx;
+
+static void slide_init(slide_ctx *ctx, const u8 scalar[32])
+{
+    // scalar is guaranteed to be below L, either because we checked (s),
+    // or because we reduced it modulo L (h_ram). L is under 2^253, so
+    // bits 253 to 255 are guaranteed to be zero. No need to test them.
+    //
+    // Note however that L is very close to 2^252, so bit 252 is almost
+    // always zero.  If we were to start at bit 251, the tests wouldn't
+    // catch the off-by-one error (constructing one that does would be
+    // prohibitively expensive).
+    //
+    // We should still check bit 252, though.
+    int i = 252;
+    while (i > 0 && scalar_bit(scalar, i) == 0) {
+        i--;
     }
+    ctx->next_check = (u8)(i + 1);
+    ctx->next_index = -1; // -1 matches no bit position: no digit pending yet
+    ctx->next_digit = -1;
 }
 
-// Look up table for sliding windows
-static void ge_precompute(ge_cached lut[8], const ge *P1)
+static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
 {
-    ge P2, tmp;
-    ge_double(&P2, P1, &tmp);
-    ge_cache(&lut[0], P1);
-    FOR (i, 0, 7) {
-        ge_add(&tmp, &P2, &lut[i]);
-        ge_cache(&lut[i+1], &tmp);
+    if (i == ctx->next_check) {
+        if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
+            ctx->next_check--;
+        } else {
+            // compute digit of next window
+            int w = MIN(width, i + 1); // clamp: no bits below bit 0
+            int v = -(scalar_bit(scalar, i) << (w-1)); // weight -2^(w-1)
+            FOR_T (int, j, 0, w-1) {
+                v += scalar_bit(scalar, i-(w-1)+j) << j;
+            }
+            v += scalar_bit(scalar, i-w); // bit just below the window
+            int lsb = v & (~v + 1);            // smallest bit of v
+            int s   = (   ((lsb & 0xAA) != 0)  // log2(lsb)
+                       | (((lsb & 0xCC) != 0) << 1)
+                       | (((lsb & 0xF0) != 0) << 2));
+            ctx->next_index  = (i16)(i-(w-1)+s);
+            ctx->next_digit  = (i8) (v >> s   );
+            ctx->next_check -= w;
+        }
     }
+    return i == ctx->next_index ? ctx->next_digit: 0; // 0: no digit at bit i
 }
 
-// Could be a function, but the macro avoids some overhead.
-#define LUT_ADD(sum, lut, adds, i)                             \
-    if (adds[i] > 0) { ge_add(sum, sum, &lut[ adds[i] / 2]); } \
-    if (adds[i] < 0) { ge_sub(sum, sum, &lut[-adds[i] / 2]); }
-
-// Variable time! P, sP, and sB must not be secret!
-static void ge_double_scalarmult_vartime(ge *sum, const ge *P,
-                                         u8 p[32], u8 b[32])
-{
-    static const fe X = { -14297830, -7645148, 16144683, -16471763, 27570974,
-                          -2696100, -26142465, 8378389, 20764389, 8758491 };
-    static const fe Y = { -26843541, -6710886, 13421773, -13421773, 26843546,
-                          6710886, -13421773, 13421773, -26843546, -6710886 };
-    ge B;
-    fe_copy(B.X, X);
-    fe_copy(B.Y, Y);
-    fe_1   (B.Z);
-    fe_mul (B.T, X, Y);
-
-    // cached points for addition
-    ge_cached cP[8];  ge_precompute(cP,  P);
-    ge_cached cB[8];  ge_precompute(cB, &B);
-    i8 p_adds[258];   slide(p_adds, p);
-    i8 b_adds[258];   slide(b_adds, b);
-
-    // Avoid the first doublings
-    int i = 253;
-    while (i >= 0         &&
-           p_adds[i] == 0 &&
-           b_adds[i] == 0) {
-        i--;
+#define P_W_WIDTH 3 // Affects the size of the stack
+#define B_W_WIDTH 5 // Affects the size of the binary
+#define P_W_SIZE  (1<<(P_W_WIDTH-2))
+
+// P = [b]B + [p]P, where B is the base point
+//
+// Variable time! Internal buffers are not wiped! Inputs must not be secret!
+// => Use only to *check* signatures.
+static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
+{
+    // cache P window for addition
+    ge_cached cP[P_W_SIZE];
+    {
+        ge P2, tmp;
+        ge_double(&P2, P, &tmp);
+        ge_cache(&cP[0], P);
+        FOR (i, 0, (P_W_SIZE)-1) {
+            ge_add(&tmp, &P2, &cP[i]);
+            ge_cache(&cP[i+1], &tmp);
+        }
     }
 
-    // Merged double and add ladder
+    // Merged double and add ladder, fused with sliding
+    slide_ctx p_slide;  slide_init(&p_slide, p);
+    slide_ctx b_slide;  slide_init(&b_slide, b);
+    int i = MAX(p_slide.next_check, b_slide.next_check);
+    ge *sum = P;
     ge_zero(sum);
-    LUT_ADD(sum, cP, p_adds, i);
-    LUT_ADD(sum, cB, b_adds, i);
-    i--;
     while (i >= 0) {
-        ge_double(sum, sum, &B); // B is no longer used, we can overwrite it
-        LUT_ADD(sum, cP, p_adds, i);
-        LUT_ADD(sum, cB, b_adds, i);
+        ge tmp;
+        ge_double(sum, sum, &tmp);
+        int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
+        int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
+        if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
+        if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
+        fe t1, t2;
+        if (b_digit > 0) { ge_madd(sum, sum,
+                                   window_Yp[ b_digit / 2],
+                                   window_Ym[ b_digit / 2],
+                                   window_T2[ b_digit / 2], t1, t2); }
+        if (b_digit < 0) { ge_msub(sum, sum,
+                                   window_Yp[-b_digit / 2],
+                                   window_Ym[-b_digit / 2],
+                                   window_T2[-b_digit / 2], t1, t2); }
         i--;
     }
 }
@@ -1765,6 +1918,7 @@ static const fe comb_T2[16] = {
      -7350198, 21035059, -14970947, 25910190, 11122681},
 };
 
+// p = [scalar]B, where B is the base point
 static void ge_scalarmult_base(ge *p, const u8 scalar[32])
 {
     // 5-bits signed comb, from Mike Hamburg's
@@ -1786,8 +1940,8 @@ static void ge_scalarmult_base(ge *p, const u8 scalar[32])
     mul_add(s_scalar, scalar, half_mod_L, half_ones);
 
     // Double and add ladder
-    fe yp, ym, t2, n2, a, b; // temporaries for addition
-    ge dbl;                  // temporary for doublings
+    fe yp, ym, t2, n2, a; // temporaries for addition
+    ge dbl;               // temporary for doubling
     ge_zero(p);
     for (int i = 50; i >= 0; i--) {
         if (i < 50) {
@@ -1796,11 +1950,11 @@ static void ge_scalarmult_base(ge *p, const u8 scalar[32])
         fe_1(yp);
         fe_1(ym);
         fe_0(t2);
-        u8 teeth =  scalar_bit(s_scalar, i)
-            +      (scalar_bit(s_scalar, i +  51) << 1)
-            +      (scalar_bit(s_scalar, i + 102) << 2)
-            +      (scalar_bit(s_scalar, i + 153) << 3)
-            +      (scalar_bit(s_scalar, i + 204) << 4);
+        u8 teeth = (u8)((scalar_bit(s_scalar, i)           ) +
+                        (scalar_bit(s_scalar, i +  51) << 1) +
+                        (scalar_bit(s_scalar, i + 102) << 2) +
+                        (scalar_bit(s_scalar, i + 153) << 3) +
+                        (scalar_bit(s_scalar, i + 204) << 4));
         u8 high  = teeth >> 4;
         u8 index = (teeth ^ (high - 1)) & 15;
         FOR (j, 0, 16) {
@@ -1813,19 +1967,20 @@ static void ge_scalarmult_base(ge *p, const u8 scalar[32])
         fe_neg(n2, t2);
         fe_cswap(t2, n2, high);
         fe_cswap(yp, ym, high);
-        ge_madd(p, p, ym, yp, n2, a, b);
+        ge_madd(p, p, ym, yp, n2, a, t2); // reuse t2 as temporary
     }
     WIPE_CTX(&dbl);
-    WIPE_BUFFER(a);  WIPE_BUFFER(yp);  WIPE_BUFFER(t2);
-    WIPE_BUFFER(b);  WIPE_BUFFER(ym);  WIPE_BUFFER(n2);
+    WIPE_BUFFER(yp);  WIPE_BUFFER(t2);  WIPE_BUFFER(a);
+    WIPE_BUFFER(ym);  WIPE_BUFFER(n2);
     WIPE_BUFFER(s_scalar);
 }
 
-void crypto_sign_public_key(u8       public_key[32],
-                            const u8 secret_key[32])
+void crypto_sign_public_key_custom_hash(u8       public_key[32],
+                                        const u8 secret_key[32],
+                                        const crypto_sign_vtable *hash)
 {
     u8 a[64];
-    HASH(a, secret_key, 32);
+    hash->hash(a, secret_key, 32);
     trim_scalar(a);
     ge A;
     ge_scalarmult_base(&A, a);
@@ -1834,44 +1989,62 @@ void crypto_sign_public_key(u8       public_key[32],
     WIPE_CTX(&A);
 }
 
-void crypto_sign_init_first_pass(crypto_sign_ctx *ctx,
-                                 const u8  secret_key[32],
-                                 const u8  public_key[32])
+void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
+{
+    crypto_sign_public_key_custom_hash(public_key, secret_key,
+                                       &crypto_blake2b_vtable);
+}
+
+void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
+                                             const u8 secret_key[32],
+                                             const u8 public_key[32],
+                                             const crypto_sign_vtable *hash)
 {
+    ctx->hash  = hash; // set vtable
     u8 *a      = ctx->buf;
     u8 *prefix = ctx->buf + 32;
-    HASH(a, secret_key, 32);
+    ctx->hash->hash(a, secret_key, 32);
     trim_scalar(a);
 
     if (public_key == 0) {
-        crypto_sign_public_key(ctx->pk, secret_key);
+        crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
     } else {
         FOR (i, 0, 32) {
             ctx->pk[i] = public_key[i];
         }
     }
 
-    // Constructs the "random" nonce from the secret key and message.
+    // Deterministic part of EdDSA: Construct a nonce by hashing the message
+    // instead of generating a random number.
     // An actual random number would work just fine, and would save us
     // the trouble of hashing the message twice.  If we did that
     // however, the user could fuck it up and reuse the nonce.
-    HASH_INIT  (&ctx->hash);
-    HASH_UPDATE(&ctx->hash, prefix , 32);
+    ctx->hash->init  (ctx);
+    ctx->hash->update(ctx, prefix , 32);
 }
 
-void crypto_sign_update(crypto_sign_ctx *ctx, const u8 *msg, size_t msg_size)
+void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
+                                 const u8 secret_key[32],
+                                 const u8 public_key[32])
 {
-    HASH_UPDATE(&ctx->hash, msg, msg_size);
+    crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
+                                            &crypto_blake2b_vtable);
 }
 
-void crypto_sign_init_second_pass(crypto_sign_ctx *ctx)
+void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
+                        const u8 *msg, size_t msg_size)
+{
+    ctx->hash->update(ctx, msg, msg_size);
+}
+
+void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
 {
     u8 *r        = ctx->buf + 32;
     u8 *half_sig = ctx->buf + 64;
-    HASH_FINAL(&ctx->hash, r);
+    ctx->hash->final(ctx, r);
     reduce(r);
 
-    // first half of the signature = "random" nonce times basepoint
+    // first half of the signature = "random" nonce times the base point
     ge R;
     ge_scalarmult_base(&R, r);
     ge_tobytes(half_sig, &R);
@@ -1879,25 +2052,25 @@ void crypto_sign_init_second_pass(crypto_sign_ctx *ctx)
 
     // Hash R, the public key, and the message together.
     // It cannot be done in parallel with the first hash.
-    HASH_INIT  (&ctx->hash);
-    HASH_UPDATE(&ctx->hash, half_sig, 32);
-    HASH_UPDATE(&ctx->hash, ctx->pk , 32);
+    ctx->hash->init  (ctx);
+    ctx->hash->update(ctx, half_sig, 32);
+    ctx->hash->update(ctx, ctx->pk , 32);
 }
 
-void crypto_sign_final(crypto_sign_ctx *ctx, u8 signature[64])
+void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
 {
     u8 *a        = ctx->buf;
     u8 *r        = ctx->buf + 32;
     u8 *half_sig = ctx->buf + 64;
-    u8 h_ram[64];
-    HASH_FINAL(&ctx->hash, h_ram);
-    reduce(h_ram);  // reduce the hash modulo L
+    u8  h_ram[64];
+    ctx->hash->final(ctx, h_ram);
+    reduce(h_ram);
     FOR (i, 0, 32) {
         signature[i] = half_sig[i];
     }
     mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
-    WIPE_CTX(ctx);
     WIPE_BUFFER(h_ram);
+    crypto_wipe(ctx, ctx->hash->ctx_size);
 }
 
 void crypto_sign(u8        signature[64],
@@ -1906,43 +2079,63 @@ void crypto_sign(u8        signature[64],
                  const u8 *message, size_t message_size)
 {
     crypto_sign_ctx ctx;
-    crypto_sign_init_first_pass (&ctx, secret_key, public_key);
-    crypto_sign_update          (&ctx, message, message_size);
-    crypto_sign_init_second_pass(&ctx);
-    crypto_sign_update          (&ctx, message, message_size);
-    crypto_sign_final           (&ctx, signature);
+    crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
+    crypto_sign_init_first_pass (actx, secret_key, public_key);
+    crypto_sign_update          (actx, message, message_size);
+    crypto_sign_init_second_pass(actx);
+    crypto_sign_update          (actx, message, message_size);
+    crypto_sign_final           (actx, signature);
 }
 
-void crypto_check_init(crypto_check_ctx *ctx,
-                      const u8 signature[64],
-                      const u8 public_key[32])
+void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
+                                   const u8 signature[64],
+                                   const u8 public_key[32],
+                                   const crypto_sign_vtable *hash)
 {
-    FOR (i, 0, 64) { ctx->sig[i] = signature [i]; }
+    ctx->hash = hash; // set vtable
+    FOR (i, 0, 64) { ctx->buf[i] = signature [i]; }
     FOR (i, 0, 32) { ctx->pk [i] = public_key[i]; }
-    HASH_INIT  (&ctx->hash);
-    HASH_UPDATE(&ctx->hash, signature , 32);
-    HASH_UPDATE(&ctx->hash, public_key, 32);
+    ctx->hash->init  (ctx);
+    ctx->hash->update(ctx, signature , 32);
+    ctx->hash->update(ctx, public_key, 32);
+}
+
+void crypto_check_init(crypto_check_ctx_abstract *ctx,
+                       const u8 signature[64],
+                       const u8 public_key[32])
+{
+    crypto_check_init_custom_hash(ctx, signature, public_key,
+                                  &crypto_blake2b_vtable);
 }
 
-void crypto_check_update(crypto_check_ctx *ctx, const u8 *msg, size_t msg_size)
+void crypto_check_update(crypto_check_ctx_abstract *ctx,
+                         const u8 *msg, size_t msg_size)
 {
-    HASH_UPDATE(&ctx->hash, msg , msg_size);
+    ctx->hash->update(ctx, msg, msg_size);
 }
 
-int crypto_check_final(crypto_check_ctx *ctx)
+int crypto_check_final(crypto_check_ctx_abstract *ctx)
 {
-    ge diff, A;
-    u8 h_ram[64], R_check[32];
-    u8 *s = ctx->sig + 32;                       // s
-    u8 *R = ctx->sig;                            // R
+    ge  A;
+    u8 *h_ram   = ctx->pk; // save stack space
+    u8 *R_check = ctx->pk; // save stack space
+    u8 *R       = ctx->buf;                      // R
+    u8 *s       = ctx->buf + 32;                 // s
+    ge *diff    = &A;                            // -A is overwritten...
     if (ge_frombytes_neg_vartime(&A, ctx->pk) ||
         is_above_L(s)) { // prevent s malleability
         return -1;
     }
-    HASH_FINAL(&ctx->hash, h_ram);
-    reduce(h_ram);
-    ge_double_scalarmult_vartime(&diff, &A, h_ram, s);
-    ge_tobytes(R_check, &diff);                  // R_check = s*B - h_ram*A
+    {
+        u8 tmp[64];
+        ctx->hash->final(ctx, tmp);
+        reduce(tmp);
+        FOR (i, 0, 32) { // the extra copy saves 32 bytes of stack
+            h_ram[i] = tmp[i];
+        }
+    }
+    ge_double_scalarmult_vartime(&A, h_ram, s);  // ...here
+    ge_tobytes(R_check, diff);                   // R_check = s*B - h_ram*A
     return crypto_verify32(R, R_check);          // R == R_check ? OK : fail
     // No secret, no wipe
 }
@@ -1952,97 +2145,41 @@ int crypto_check(const u8  signature[64],
                  const u8 *message, size_t message_size)
 {
     crypto_check_ctx ctx;
-    crypto_check_init(&ctx, signature, public_key);
-    crypto_check_update(&ctx, message, message_size);
-    return crypto_check_final(&ctx);
+    crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
+    crypto_check_init  (actx, signature, public_key);
+    crypto_check_update(actx, message, message_size);
+    return crypto_check_final(actx);
 }
 
 ////////////////////
 /// Key exchange ///
 ////////////////////
-int crypto_key_exchange(u8       shared_key[32],
-                        const u8 your_secret_key [32],
-                        const u8 their_public_key[32])
+void crypto_key_exchange(u8       shared_key[32],
+                         const u8 your_secret_key [32],
+                         const u8 their_public_key[32])
 {
-    u8 raw_shared_secret[32];
-    int status = crypto_x25519(raw_shared_secret,
-                               your_secret_key, their_public_key);
-    crypto_chacha20_H(shared_key, raw_shared_secret, zero);
-    WIPE_BUFFER(raw_shared_secret);
-    return status;
+    crypto_x25519(shared_key, your_secret_key, their_public_key);
+    crypto_hchacha20(shared_key, shared_key, zero);
 }
 
 ////////////////////////////////
 /// Authenticated encryption ///
 ////////////////////////////////
-static void lock_ad_padding(crypto_lock_ctx *ctx)
-{
-    if (ctx->ad_phase) {
-        ctx->ad_phase = 0;
-        crypto_poly1305_update(&ctx->poly, zero, ALIGN(ctx->ad_size, 16));
-    }
-}
-
-void crypto_lock_init(crypto_lock_ctx *ctx,
-                      const u8 key[32], const u8 nonce[24])
-{
-    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
-    ctx->ad_phase     = 1;
-    ctx->ad_size      = 0;
-    ctx->message_size = 0;
-    crypto_chacha20_x_init(&ctx->chacha, key, nonce);
-    crypto_chacha20_stream(&ctx->chacha, auth_key, 64);
-    crypto_poly1305_init  (&ctx->poly  , auth_key);
-    WIPE_BUFFER(auth_key);
-}
-
-void crypto_lock_auth_ad(crypto_lock_ctx *ctx, const u8 *msg, size_t msg_size)
-{
-    crypto_poly1305_update(&ctx->poly, msg, msg_size);
-    ctx->ad_size += msg_size;
-}
-
-void crypto_lock_auth_message(crypto_lock_ctx *ctx,
-                              const u8 *cipher_text, size_t text_size)
-{
-    lock_ad_padding(ctx);
-    ctx->message_size += text_size;
-    crypto_poly1305_update(&ctx->poly, cipher_text, text_size);
-}
-
-void crypto_lock_update(crypto_lock_ctx *ctx, u8 *cipher_text,
-                        const u8 *plain_text, size_t text_size)
+static void lock_auth(u8 mac[16], const u8  auth_key[32],
+                      const u8 *ad         , size_t ad_size,
+                      const u8 *cipher_text, size_t text_size)
 {
-    crypto_chacha20_encrypt(&ctx->chacha, cipher_text, plain_text, text_size);
-    crypto_lock_auth_message(ctx, cipher_text, text_size);
-}
-
-void crypto_lock_final(crypto_lock_ctx *ctx, u8 mac[16])
-{
-    lock_ad_padding(ctx);
     u8 sizes[16]; // Not secret, not wiped
-    store64_le(sizes + 0, ctx->ad_size);
-    store64_le(sizes + 8, ctx->message_size);
-    crypto_poly1305_update(&ctx->poly, zero, ALIGN(ctx->message_size, 16));
-    crypto_poly1305_update(&ctx->poly, sizes, 16);
-    crypto_poly1305_final (&ctx->poly, mac);
-    WIPE_CTX(ctx);
-}
-
-void crypto_unlock_update(crypto_lock_ctx *ctx, u8 *plain_text,
-                          const u8 *cipher_text, size_t text_size)
-{
-    crypto_unlock_auth_message(ctx, cipher_text, text_size);
-    crypto_chacha20_encrypt(&ctx->chacha, plain_text, cipher_text, text_size);
-}
-
-int crypto_unlock_final(crypto_lock_ctx *ctx, const u8 mac[16])
-{
-    u8 real_mac[16];
-    crypto_lock_final(ctx, real_mac);
-    int mismatch = crypto_verify16(real_mac, mac);
-    WIPE_BUFFER(real_mac);
-    return mismatch;
+    store64_le(sizes + 0, ad_size);
+    store64_le(sizes + 8, text_size);
+    crypto_poly1305_ctx poly_ctx;           // auto wiped...
+    crypto_poly1305_init  (&poly_ctx, auth_key);
+    crypto_poly1305_update(&poly_ctx, ad         , ad_size);
+    crypto_poly1305_update(&poly_ctx, zero       , ALIGN(ad_size, 16));
+    crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
+    crypto_poly1305_update(&poly_ctx, zero       , ALIGN(text_size, 16));
+    crypto_poly1305_update(&poly_ctx, sizes      , 16);
+    crypto_poly1305_final (&poly_ctx, mac); // ...here
 }
 
 void crypto_lock_aead(u8        mac[16],
@@ -2052,11 +2189,15 @@ void crypto_lock_aead(u8        mac[16],
                       const u8 *ad        , size_t ad_size,
                       const u8 *plain_text, size_t text_size)
 {
-    crypto_lock_ctx ctx;
-    crypto_lock_init   (&ctx, key, nonce);
-    crypto_lock_auth_ad(&ctx, ad, ad_size);
-    crypto_lock_update (&ctx, cipher_text, plain_text, text_size);
-    crypto_lock_final  (&ctx, mac);
+    u8 sub_key[32];
+    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
+    crypto_hchacha20(sub_key, key, nonce);
+    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
+    crypto_chacha20_ctr(cipher_text, plain_text, text_size,
+                        sub_key, nonce + 16, 1);
+    lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
+    WIPE_BUFFER(sub_key);
+    WIPE_BUFFER(auth_key);
 }
 
 int crypto_unlock_aead(u8       *plain_text,
@@ -2066,17 +2207,22 @@ int crypto_unlock_aead(u8       *plain_text,
                        const u8 *ad         , size_t ad_size,
                        const u8 *cipher_text, size_t text_size)
 {
-    crypto_unlock_ctx ctx;
-    crypto_unlock_init        (&ctx, key, nonce);
-    crypto_unlock_auth_ad     (&ctx, ad, ad_size);
-    crypto_unlock_auth_message(&ctx, cipher_text, text_size);
-    crypto_chacha_ctx chacha_ctx = ctx.chacha; // avoid the wiping...
-    if (crypto_unlock_final(&ctx, mac)) {      // ...that occurs here
-        WIPE_CTX(&chacha_ctx);
-        return -1; // reject forgeries before wasting our time decrypting
+    u8 sub_key[32];
+    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
+    crypto_hchacha20(sub_key, key, nonce);
+    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
+    u8 real_mac[16];
+    lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
+    WIPE_BUFFER(auth_key);
+    if (crypto_verify16(mac, real_mac)) {
+        WIPE_BUFFER(sub_key);
+        WIPE_BUFFER(real_mac);
+        return -1;
     }
-    crypto_chacha20_encrypt(&chacha_ctx, plain_text, cipher_text, text_size);
-    WIPE_CTX(&chacha_ctx);
+    crypto_chacha20_ctr(plain_text, cipher_text, text_size,
+                        sub_key, nonce + 16, 1);
+    WIPE_BUFFER(sub_key);
+    WIPE_BUFFER(real_mac);
     return 0;
 }
 
@@ -2109,6 +2255,20 @@ int crypto_unlock(u8       *plain_text,
 #include <caml/bigarray.h>
 #include "platform-specific.h"
 
+void crypto_sign_public_key_extended(u8       public_key[32],
+                                     const u8 extended_key[64]) {
+    ge A;
+    ge_scalarmult_base(&A, extended_key);
+    ge_tobytes(public_key, &A);
+    WIPE_CTX(&A);
+}
+
+CAMLprim value caml_monocypher_crypto_sign_public_key_extended(value pk, value ek) {
+    crypto_sign_public_key_extended(Caml_ba_data_val(pk),
+                                    Caml_ba_data_val(ek));
+    return Val_unit;
+}
+
 CAMLprim value caml_monocypher_trim_scalar(value buf) {
     trim_scalar(Caml_ba_data_val(buf));
     return Val_unit;
@@ -2142,28 +2302,6 @@ CAMLprim value caml_monocypher_crypto_verify64(value a, value b) {
     return Val_int(crypto_verify64(Caml_ba_data_val(a), Caml_ba_data_val(b)));
 }
 
-CAMLprim value caml_monocypher_sizeof_crypto_sha512_ctx(value unit) {
-    return Val_long(sizeof(crypto_sha512_ctx));
-}
-
-CAMLprim value caml_monocypher_crypto_sha512_init(value ctx) {
-    crypto_sha512_init(Caml_ba_data_val(ctx));
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_sha512_update(value ctx, value msg) {
-    crypto_sha512_update(Caml_ba_data_val(ctx),
-                         Caml_ba_data_val(msg),
-                         Caml_ba_array_val(msg)->dim[0]);
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_sha512_final(value ctx, value hash) {
-    crypto_sha512_final(Caml_ba_data_val(ctx),
-                        Caml_ba_data_val(hash));
-    return Val_unit;
-}
-
 CAMLprim value caml_monocypher_sizeof_crypto_blake2b_ctx(value unit) {
     return Val_long(sizeof(crypto_blake2b_ctx));
 }
@@ -2215,9 +2353,10 @@ CAMLprim value caml_monocypher_crypto_key_exchange_public_key(value pk, value sk
 }
 
 CAMLprim value caml_monocypher_crypto_key_exchange(value k, value sk, value pk) {
-    return Val_int(crypto_key_exchange(Caml_ba_data_val(k),
-                                       Caml_ba_data_val(sk),
-                                       Caml_ba_data_val(pk)));
+    crypto_key_exchange(Caml_ba_data_val(k),
+                        Caml_ba_data_val(sk),
+                        Caml_ba_data_val(pk));
+    return Val_unit;
 }
 
 CAMLprim value caml_monocypher_crypto_lock(value mac, value cmsg, value key, value nonce) {
@@ -2239,109 +2378,6 @@ CAMLprim value caml_monocypher_crypto_unlock(value mac, value msg, value key, va
                                  Caml_ba_array_val(msg)->dim[0]));
 }
 
-CAMLprim value caml_monocypher_crypto_sign_public_key(value pk, value sk) {
-    crypto_sign_public_key(Caml_ba_data_val(pk),
-                           Caml_ba_data_val(sk));
-    return Val_unit;
-}
-
-void crypto_sign_public_key_extended(u8       public_key[32],
-                                     const u8 extended_key[64])
-{
-    ge A;
-    ge_scalarmult_base(&A, extended_key);
-    ge_tobytes(public_key, &A);
-    WIPE_CTX(&A);
-}
-
-CAMLprim value caml_monocypher_crypto_sign_public_key_extended(value pk, value ek) {
-    crypto_sign_public_key_extended(Caml_ba_data_val(pk),
-                                    Caml_ba_data_val(ek));
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_sizeof_crypto_sign_ctx(value unit) {
-    return Val_int(sizeof(crypto_sign_ctx));
-}
-
-CAMLprim value caml_monocypher_crypto_sign_init_first_pass(value ctx, value sk, value pk) {
-    crypto_sign_init_first_pass(Caml_ba_data_val(ctx),
-                                Caml_ba_data_val(sk),
-                                Caml_ba_data_val(pk));
-    return Val_unit;
-}
-
-void crypto_sign_init_first_pass_extended(crypto_sign_ctx *ctx,
-                                          const u8  extended_key[64],
-                                          const u8  public_key[32])
-{
-    u8 *a      = ctx->buf;
-    u8 *prefix = ctx->buf + 32;
-    FOR(i, 0, 64) { a[i] = extended_key[i]; }
-
-    if (public_key == 0) {
-        crypto_sign_public_key_extended(ctx->pk, extended_key);
-    } else {
-        FOR (i, 0, 32) {
-            ctx->pk[i] = public_key[i];
-        }
-    }
-
-    // Constructs the "random" nonce from the secret key and message.
-    // An actual random number would work just fine, and would save us
-    // the trouble of hashing the message twice.  If we did that
-    // however, the user could fuck it up and reuse the nonce.
-    HASH_INIT  (&ctx->hash);
-    HASH_UPDATE(&ctx->hash, prefix , 32);
-}
-
-CAMLprim value caml_monocypher_crypto_sign_init_first_pass_extended(value ctx, value ek, value pk) {
-    crypto_sign_init_first_pass_extended(Caml_ba_data_val(ctx),
-                                         Caml_ba_data_val(ek),
-                                         Caml_ba_data_val(pk));
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_sign_update(value ctx, value msg) {
-    crypto_sign_update(Caml_ba_data_val(ctx),
-                       Caml_ba_data_val(msg),
-                       Caml_ba_array_val(msg)->dim[0]);
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_sign_init_second_pass(value ctx) {
-    crypto_sign_init_second_pass(Caml_ba_data_val(ctx));
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_sign_final(value ctx, value signature) {
-    crypto_sign_final(Caml_ba_data_val(ctx),
-                      Caml_ba_data_val(signature));
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_sizeof_crypto_check_ctx(value unit) {
-    return Val_int(sizeof(crypto_check_ctx));
-}
-
-CAMLprim value caml_monocypher_crypto_check_init(value ctx, value signature, value pk) {
-    crypto_check_init(Caml_ba_data_val(ctx),
-                      Caml_ba_data_val(signature),
-                      Caml_ba_data_val(pk));
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_check_update(value ctx, value msg) {
-    crypto_check_update(Caml_ba_data_val(ctx),
-                        Caml_ba_data_val(msg),
-                        Caml_ba_array_val(msg)->dim[0]);
-    return Val_unit;
-}
-
-CAMLprim value caml_monocypher_crypto_check_final(value ctx) {
-    return Val_int(crypto_check_final(Caml_ba_data_val(ctx)));
-}
-
 CAMLprim value caml_monocypher_ge_cache(value cached, value ge) {
     ge_cache(Caml_ba_data_val(cached),
              Caml_ba_data_val(ge));
@@ -2412,11 +2448,10 @@ CAMLprim value caml_monocypher_ge_add(value s, value p, value q) {
     return Val_unit;
 }
 
-CAMLprim value caml_monocypher_ge_double_scalarmult(value p, value q, value s1, value s2) {
-    ge_double_scalarmult_vartime(Caml_ba_data_val(p),
-                                 Caml_ba_data_val(q),
-                                 Caml_ba_data_val(s1),
-                                 Caml_ba_data_val(s2));
+CAMLprim value caml_monocypher_ge_double_scalarmult(value P, value p, value b) {
+    ge_double_scalarmult_vartime(Caml_ba_data_val(P),
+                                 Caml_ba_data_val(p),
+                                 Caml_ba_data_val(b));
     return Val_unit;
 }
 
diff --git a/src/sha512.c b/src/sha512.c
deleted file mode 100644
index 5833d19..0000000
--- a/src/sha512.c
+++ /dev/null
@@ -1,207 +0,0 @@
-#include "sha512.h"
-
-#define FOR(i, min, max)     for (size_t i = min; i < max; i++)
-#define WIPE_CTX(ctx)        crypto_wipe(ctx   , sizeof(*(ctx)))
-#define MIN(a, b)            ((a) <= (b) ? (a) : (b))
-#define ALIGN(x, block_size) ((~(x) + 1) & ((block_size) - 1))
-typedef uint8_t u8;
-typedef uint64_t u64;
-
-static u64 load64_be(const u8 s[8])
-{
-    return((u64)s[0] << 56)
-        | ((u64)s[1] << 48)
-        | ((u64)s[2] << 40)
-        | ((u64)s[3] << 32)
-        | ((u64)s[4] << 24)
-        | ((u64)s[5] << 16)
-        | ((u64)s[6] <<  8)
-        |  (u64)s[7];
-}
-
-static void store64_be(u8 out[8], u64 in)
-{
-    out[0] = (in >> 56) & 0xff;
-    out[1] = (in >> 48) & 0xff;
-    out[2] = (in >> 40) & 0xff;
-    out[3] = (in >> 32) & 0xff;
-    out[4] = (in >> 24) & 0xff;
-    out[5] = (in >> 16) & 0xff;
-    out[6] = (in >>  8) & 0xff;
-    out[7] =  in        & 0xff;
-}
-
-static void crypto_wipe(void *secret, size_t size)
-{
-    volatile u8 *v_secret = (u8*)secret;
-    FOR (i, 0, size) {
-        v_secret[i] = 0;
-    }
-}
-
-static u64 rot(u64 x, int c       ) { return (x >> c) | (x << (64 - c));   }
-static u64 ch (u64 x, u64 y, u64 z) { return (x & y) ^ (~x & z);           }
-static u64 maj(u64 x, u64 y, u64 z) { return (x & y) ^ ( x & z) ^ (y & z); }
-static u64 big_sigma0(u64 x) { return rot(x, 28) ^ rot(x, 34) ^ rot(x, 39); }
-static u64 big_sigma1(u64 x) { return rot(x, 14) ^ rot(x, 18) ^ rot(x, 41); }
-static u64 lit_sigma0(u64 x) { return rot(x,  1) ^ rot(x,  8) ^ (x >> 7);   }
-static u64 lit_sigma1(u64 x) { return rot(x, 19) ^ rot(x, 61) ^ (x >> 6);   }
-
-static const u64 K[80] = {
-    0x428a2f98d728ae22,0x7137449123ef65cd,0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc,
-    0x3956c25bf348b538,0x59f111f1b605d019,0x923f82a4af194f9b,0xab1c5ed5da6d8118,
-    0xd807aa98a3030242,0x12835b0145706fbe,0x243185be4ee4b28c,0x550c7dc3d5ffb4e2,
-    0x72be5d74f27b896f,0x80deb1fe3b1696b1,0x9bdc06a725c71235,0xc19bf174cf692694,
-    0xe49b69c19ef14ad2,0xefbe4786384f25e3,0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65,
-    0x2de92c6f592b0275,0x4a7484aa6ea6e483,0x5cb0a9dcbd41fbd4,0x76f988da831153b5,
-    0x983e5152ee66dfab,0xa831c66d2db43210,0xb00327c898fb213f,0xbf597fc7beef0ee4,
-    0xc6e00bf33da88fc2,0xd5a79147930aa725,0x06ca6351e003826f,0x142929670a0e6e70,
-    0x27b70a8546d22ffc,0x2e1b21385c26c926,0x4d2c6dfc5ac42aed,0x53380d139d95b3df,
-    0x650a73548baf63de,0x766a0abb3c77b2a8,0x81c2c92e47edaee6,0x92722c851482353b,
-    0xa2bfe8a14cf10364,0xa81a664bbc423001,0xc24b8b70d0f89791,0xc76c51a30654be30,
-    0xd192e819d6ef5218,0xd69906245565a910,0xf40e35855771202a,0x106aa07032bbd1b8,
-    0x19a4c116b8d2d0c8,0x1e376c085141ab53,0x2748774cdf8eeb99,0x34b0bcb5e19b48a8,
-    0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb,0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3,
-    0x748f82ee5defb2fc,0x78a5636f43172f60,0x84c87814a1f0ab72,0x8cc702081a6439ec,
-    0x90befffa23631e28,0xa4506cebde82bde9,0xbef9a3f7b2c67915,0xc67178f2e372532b,
-    0xca273eceea26619c,0xd186b8c721c0c207,0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178,
-    0x06f067aa72176fba,0x0a637dc5a2c898a6,0x113f9804bef90dae,0x1b710b35131c471b,
-    0x28db77f523047d84,0x32caab7b40c72493,0x3c9ebe0a15c9bebc,0x431d67c49c100d4c,
-    0x4cc5d4becb3e42b6,0x597f299cfc657e2a,0x5fcb6fab3ad6faec,0x6c44198c4a475817
-};
-
-static void sha512_compress(crypto_sha512_ctx *ctx)
-{
-    u64 *w = ctx->w;
-    FOR(i,  0, 16) { w[i] = ctx->input[i]; }
-    FOR(i, 16, 80) { w[i] = (lit_sigma1(w[i- 2]) + w[i- 7] +
-                             lit_sigma0(w[i-15]) + w[i-16]); }
-
-    u64 a = ctx->hash[0];    u64 b = ctx->hash[1];
-    u64 c = ctx->hash[2];    u64 d = ctx->hash[3];
-    u64 e = ctx->hash[4];    u64 f = ctx->hash[5];
-    u64 g = ctx->hash[6];    u64 h = ctx->hash[7];
-    FOR(i, 0, 80) {
-        u64 t1 = big_sigma1(e) + ch (e, f, g) + h + K[i] + w[i];
-        u64 t2 = big_sigma0(a) + maj(a, b, c);
-        h = g;  g = f;  f = e;  e = d  + t1;
-        d = c;  c = b;  b = a;  a = t1 + t2;
-    }
-    ctx->hash[0] += a;    ctx->hash[1] += b;
-    ctx->hash[2] += c;    ctx->hash[3] += d;
-    ctx->hash[4] += e;    ctx->hash[5] += f;
-    ctx->hash[6] += g;    ctx->hash[7] += h;
-}
-
-static void sha512_set_input(crypto_sha512_ctx *ctx, u8 input)
-{
-    size_t word = ctx->input_idx / 8;
-    size_t byte = ctx->input_idx % 8;
-    ctx->input[word] |= (u64)input << (8 * (7 - byte));
-}
-
-static void sha512_reset_input(crypto_sha512_ctx *ctx)
-{
-    FOR(i, 0, 16) {
-        ctx->input[i] = 0;
-    }
-    ctx->input_idx = 0;
-}
-
-// increment a 128-bit "word".
-static void sha512_incr(u64 x[2], u64 y)
-{
-    x[1] += y;
-    if (x[1] < y) {
-        x[0]++;
-    }
-}
-
-static void sha512_end_block(crypto_sha512_ctx *ctx)
-{
-    if (ctx->input_idx == 128) {
-        sha512_incr(ctx->input_size, 1024); // size is in bits
-        sha512_compress(ctx);
-        sha512_reset_input(ctx);
-    }
-}
-
-static void sha512_update(crypto_sha512_ctx *ctx,
-                          const u8 *message, size_t message_size)
-{
-    FOR (i, 0, message_size) {
-        sha512_set_input(ctx, message[i]);
-        ctx->input_idx++;
-        sha512_end_block(ctx);
-    }
-}
-
-void crypto_sha512_init(crypto_sha512_ctx *ctx)
-{
-    ctx->hash[0] = 0x6a09e667f3bcc908;
-    ctx->hash[1] = 0xbb67ae8584caa73b;
-    ctx->hash[2] = 0x3c6ef372fe94f82b;
-    ctx->hash[3] = 0xa54ff53a5f1d36f1;
-    ctx->hash[4] = 0x510e527fade682d1;
-    ctx->hash[5] = 0x9b05688c2b3e6c1f;
-    ctx->hash[6] = 0x1f83d9abfb41bd6b;
-    ctx->hash[7] = 0x5be0cd19137e2179;
-    ctx->input_size[0] = 0;
-    ctx->input_size[1] = 0;
-    sha512_reset_input(ctx);
-}
-
-void crypto_sha512_update(crypto_sha512_ctx *ctx,
-                          const u8 *message, size_t message_size)
-{
-    // Align ourselves with block boundaries
-    size_t align = MIN(ALIGN(ctx->input_idx, 128), message_size);
-    sha512_update(ctx, message, align);
-    message      += align;
-    message_size -= align;
-
-    // Process the message block by block
-    FOR (i, 0, message_size / 128) { // number of blocks
-        FOR (j, 0, 16) {
-            ctx->input[j] = load64_be(message + j*8);
-        }
-        message        += 128;
-        ctx->input_idx += 128;
-        sha512_end_block(ctx);
-    }
-    message_size &= 127;
-
-    // remaining bytes
-    sha512_update(ctx, message, message_size);
-}
-
-void crypto_sha512_final(crypto_sha512_ctx *ctx, u8 hash[64])
-{
-    sha512_incr(ctx->input_size, ctx->input_idx * 8); // size is in bits
-    sha512_set_input(ctx, 128);                       // padding
-
-    // compress penultimate block (if any)
-    if (ctx->input_idx > 111) {
-        sha512_compress(ctx);
-        sha512_reset_input(ctx);
-    }
-    // compress last block
-    ctx->input[14] = ctx->input_size[0];
-    ctx->input[15] = ctx->input_size[1];
-    sha512_compress(ctx);
-
-    // copy hash to output (big endian)
-    FOR (i, 0, 8) {
-        store64_be(hash + i*8, ctx->hash[i]);
-    }
-
-    WIPE_CTX(ctx);
-}
-
-void crypto_sha512(u8 *hash, const u8 *message, size_t message_size)
-{
-    crypto_sha512_ctx ctx;
-    crypto_sha512_init  (&ctx);
-    crypto_sha512_update(&ctx, message, message_size);
-    crypto_sha512_final (&ctx, hash);
-}
diff --git a/src/sha512.h b/src/sha512.h
deleted file mode 100644
index 1354079..0000000
--- a/src/sha512.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef SHA512_H
-#define SHA512_H
-
-#include <stddef.h>
-#include <inttypes.h>
-
-typedef struct {
-    uint64_t w[80]; // work area
-    uint64_t hash[8];
-    uint64_t input[16];
-    uint64_t input_size[2];
-    size_t   input_idx;
-} crypto_sha512_ctx;
-
-void crypto_sha512_init  (crypto_sha512_ctx *ctx);
-void crypto_sha512_update(crypto_sha512_ctx *ctx,
-                          const uint8_t *message, size_t  message_size);
-void crypto_sha512_final (crypto_sha512_ctx *ctx, uint8_t hash[64]);
-
-void crypto_sha512(uint8_t *out,const uint8_t *message, size_t message_size);
-
-#endif // SHA512_H
diff --git a/test/test.ml b/test/test.ml
index 5ae706f..b796339 100644
--- a/test/test.ml
+++ b/test/test.ml
@@ -97,8 +97,8 @@ let test_dh () =
   let sk2 = DH.sk_of_bytes (Rand.gen DH.bytes) in
   let pk = DH.neuterize sk in
   let pk2 = DH.neuterize sk2 in
-  let k = DH.shared_exn sk pk2 in
-  let k2 = DH.shared_exn sk2 pk in
+  let k = DH.shared sk pk2 in
+  let k2 = DH.shared sk2 pk in
   Alcotest.(check bool "dh" true DH.(equal k k2))
 
 let msg = Bigstring.of_string "Voulez-vous coucher avec moi, ce soir ?"
@@ -108,7 +108,7 @@ let test_box () =
   let sk = DH.sk_of_bytes (Rand.gen DH.bytes) in
   let sk2 = DH.sk_of_bytes (Rand.gen DH.bytes) in
   let pk2 = DH.neuterize sk2 in
-  let k = DH.shared_exn sk pk2 in
+  let k = DH.shared sk pk2 in
   let key = DH.buffer k in
   let buf = Bigstring.copy msg in
   let mac = Bigstring.create Box.macbytes in
@@ -177,15 +177,17 @@ let test_assoc () =
 
 let test_arith () =
   let pk = Ed25519.of_pk Sign.(neuterize (sk_of_bytes (Rand.gen skbytes))) in
-  let pk2 = Ed25519.scalarmult pk (Z.of_int 3) in
-  let pk2' = Ed25519.(add (add pk (cache pk)) (cache pk)) in
-  Alcotest.(check bool "arith2" true (Ed25519.equal pk2 pk2'))
+  let pk_times_3 = Ed25519.copy pk in
+  Ed25519.scalarmult pk_times_3 (Z.of_int 3) ;
+  let pk_cached = Ed25519.cache pk in
+  let pk2 = Ed25519.(add (add pk pk_cached) pk_cached) in
+  Alcotest.(check bool "arith2" true (Ed25519.equal pk_times_3 pk2))
 
 let test_arith2 () =
   let a = Ed25519.scalarmult_base (Z.of_int 3) in
-  let b = Ed25519.scalarmult a (Z.of_int 2) in
-  let b' = Ed25519.scalarmult_base (Z.of_int 6) in
-  Alcotest.(check bool "arith2" true (Ed25519.equal b b'))
+  Ed25519.scalarmult a (Z.of_int 2) ;
+  let b = Ed25519.scalarmult_base (Z.of_int 6) in
+  Alcotest.(check bool "arith2" true (Ed25519.equal a b))
 
 let basic = [
   "Rand.gen", `Quick, test_rand_gen ;