diff --git a/Circuits/README.txt b/Circuits/README.txt
index 55fdf5c5..245890b5 100644
--- a/Circuits/README.txt
+++ b/Circuits/README.txt
@@ -94,3 +94,47 @@ The list of all main files (excluding test ones) to convert (and how
 to do so) are given in ./convert.sh
 
 
+-------------------------------------------------------------
+
+There is a program which uses the Yosys synthesis tools to produce
+a Bristol Fashion circuit representation. This is untested by the
+SCALE maintainers, but has been provided by Mark Will.
+
+
+./convert_yosys.py --help
+usage: convert_yosys.py [-h] [-t TOP_MODULE] [-sy] [-a ADD_FILE]
+                        [-l {verilog,vhdl}] [-v]
+                        input_file output_file
+
+positional arguments:
+  input_file            File to be converted
+  output_file           Bristol output file
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -t TOP_MODULE, --top_module TOP_MODULE
+                        Top module of the design
+  -sy, --skip_yosys     Input file is now assumed to have been synthesised.
+                        This allows you to run Yosys manually
+  -a ADD_FILE, --add_file ADD_FILE
+                        Add more files to the design
+  -l {verilog,vhdl}, --lang {verilog,vhdl}
+                        Input langauge. Note vhdl requires Yosys to be built
+                        with Verific support, probably better to convert to
+                        Verilog for this converter
+  -v, --verbose
+
+
+So a basic use could be the following:
+
+> cat mult64_2.v
+module mult64_2 (a,b,res);
+    input signed [63:0] a;
+    input signed [63:0] b;
+    output [127:0] res;
+    assign res = (a * b);
+endmodule
+
+> ./convert_yosys.py mult64_2.v mult64_2.txt
+
+
diff --git a/Circuits/convert_yosys.py b/Circuits/convert_yosys.py
new file mode 100644
index 00000000..72540df3
--- /dev/null
+++ b/Circuits/convert_yosys.py
@@ -0,0 +1,408 @@
+#!/usr/bin/python3
+
+# Author: Mark Will 
+# Copyright (c) 2019 Acronis Asia R&D Pte Ltd 
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import argparse
+import logging
+import re
+import subprocess
+from functools import reduce
+
+# Handle arguments
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "input_file",
+    help = "File to be converted", 
+    type = str
+)
+parser.add_argument(
+    "output_file", 
+    help = "Bristol output file", 
+    type = str
+)
+parser.add_argument(
+    "-t", 
+    "--top_module", 
+    help = "Top module of the design", 
+    type = str
+)
+parser.add_argument(
+    "-sy", 
+    "--skip_yosys", 
+    help = "Input file is now assumed to have been synthesised. This allows you to run Yosys manually", 
+    action = "store_true"
+)
+parser.add_argument(
+    "-a", 
+    "--add_file", 
+    help = "Add more files to the design", 
+    action = 'append', 
+    type = str
+)
+parser.add_argument(
+    "-l", 
+    "--lang", 
+    help = """ 
+        Input langauge. Note vhdl requires Yosys to be built with Verific support, 
+        probably better to convert to Verilog for this converter
+    """, 
+    type = str, 
+    choices = ["verilog", "vhdl"], 
+    default = "verilog"
+)
+parser.add_argument(
+    "-v", 
+    "--verbose", 
+    action = "store_true"
+) 
+args = parser.parse_args()
+
+if args.verbose:
+    logging.basicConfig(level=logging.INFO)
+else:
+    logging.basicConfig(level=logging.ERROR)
+
+if args.top_module == None:
+    tm = re.findall(r"([^/.]+)\.[^/]*$", args.input_file)
+    if len(tm) != 1:
+        tm = ['']
+    args.top_module = tm[0]
+    logging.info("Missing top module, assuming input filename: %s" % args.top_module)
+
+
+if not args.skip_yosys:
+    # Make sure yosys is installed and in the path
+    try:
+        with subprocess.Popen(["yosys", "-V"], stdout=subprocess.PIPE) as proc:
+            proc.wait()
+            version = re.findall(r"Yosys (\d+\.\d+)", str(proc.stdout.read()))
+            if len(version) == 1: 
+                logging.info("Using Yosys version %s" % version[0])
+            else:
+                raise Exception("Yosys version not found.")
+    except Exception as e:
+        logging.error(e)
+        logging.error("Yosys not found. Please install it (http://www.clifford.at/yosys/download.html)")
+        exit()
+    # Process the design
+    try:
+        pipe_stdout = subprocess.PIPE if not args.verbose else None
+        with subprocess.Popen(["yosys"], stdin = subprocess.PIPE, stdout = pipe_stdout) as proc:
+            # Please feel free to change the commands or use -sy to manually run Yosys
+            if args.lang == 'verilog':
+                read_option = "-sv"
+            else:
+                read_option = "-vhdl"
+            input_files = [args.input_file]
+            input_files += args.add_file if args.add_file else []
+            for f in input_files: 
+                proc.stdin.write(("read %s %s\n" % (read_option, f)).encode())
+            proc.stdin.write(("synth -top %s\n" % args.top_module).encode())
+            # Needed to reduce the gates to ones supported by the Bristol format
+            proc.stdin.write(b"proc\n")
+            proc.stdin.write(b"opt\n")
+            proc.stdin.write(b"fsm\n")
+            proc.stdin.write(b"flatten\n")
+            proc.stdin.write(b"memory\n")
+            proc.stdin.write(b"opt\n")
+            proc.stdin.write(b"techmap\n")
+            proc.stdin.write(b"opt\n")
+            proc.stdin.write(b"wreduce\n")
+            # This can be slow
+            #proc.stdin.write(b"freduce -vv\n")
+            proc.stdin.write(b"clean\n")
+            proc.stdin.write(b"abc -g AND\n")
+            proc.stdin.write(b"opt_reduce -fine\n")
+            for i in range(5):
+                proc.stdin.write(b"abc -g XOR,AND\n")
+                proc.stdin.write(b"opt -fine\n")
+                proc.stdin.write(b"clean\n")
+                proc.stdin.write(b"flatten\n")
+            proc.stdin.write(b"torder\n")
+            # Note that -nohex and -noattr are required
+            proc.stdin.write(("write_verilog -nohex -noattr %s.yosys.v\n" % args.input_file).encode())
+            out = proc.communicate(input=b"exit")
+            if not args.verbose:
+                pstdout = out[0].decode("utf-8")
+                logging.info(pstdout)
+                errors = re.findall(r"\nERROR: ([^\n]*)\n", pstdout)
+                for error in errors:
+                    logging.error(error)
+                    if args.lang == 'vhdl' and 'Verific' in error:
+                        logging.error("Note that Yosys needs to be built with Verific support. You could also try https://github.com/pliu6/vhd2vl.git to convert your VHDL code to Verilog.")
+    except Exception as e:
+        logging.error(e)
+        exit()
+
+# Convert yosys verilog output to Bristol format
+# Note only one module should be here.
+wires = []
+inputs = []
+outputs = []
+assigns = []
+logging.info("Loading design from Yosys")
+filename = "%s.yosys.v" % args.input_file if not args.skip_yosys else args.input_file
+with open(filename, 'r') as f:
+    modules = 0
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        line = line.strip()
+        if line.startswith('wire'):
+            wires.append(line[5:-1])
+        elif line.startswith('input'):
+            inputs.append(line[6:-1])
+        elif line.startswith('output'):
+            outputs.append(line[7:-1])
+        elif line.startswith('assign'):
+            assigns.append(line[7:-1])
+        elif line.startswith('module'):
+            modules += 1
+    assert modules == 1
+
+
+
+num_input_wires = 0
+num_output_wires = 0
+wire_mapping = {}
+input_wire = 0
+input_lens = []
+output_lens = []
+prop_lens = {}
+
+# Inputs are first wires
+logging.info("Processing input wires")
+
+def getIOProp(line):
+    line = line.strip()
+    arr = re.findall(r"\[([0-9]+)\:([0-9]+)\]", line)
+    c = 1
+    if arr:
+        c += (reduce(lambda a,b: a + int(b), [0] + list(arr[0])))
+    name = re.findall(r"[^ ]*$", line)[0]
+    return (name, c)
+
+for ip in inputs:
+    name, l = getIOProp(ip)
+    num_input_wires += l
+    input_lens.append(str(l))
+    prop_lens[name] = l
+    if l == 1:
+        wire_mapping[name] = input_wire
+        input_wire += 1
+    else:
+        for i in range(l):
+            wire_mapping["%s[%s]" % (name, i)] = input_wire
+            input_wire += 1
+
+# Internal wires are numbered after the input wires and the gnd/high wires
+# Sorting the declarations as plain strings should be fine
+logging.info("Processing internal wires")
+wires.sort()
+num_wires = 2
+for i in range(len(wires)):
+    name, l = getIOProp(wires[i])
+    prop_lens[name] = l
+    if l == 1:
+        wire_mapping[name] = num_wires + num_input_wires
+        num_wires += 1
+    else:
+        for j in range(l):
+            wire_mapping["%s[%s]" % (name, j)] = num_wires + num_input_wires
+            num_wires += 1
+
+# Outputs are last wires, note gnd and high require a wire
+logging.info("Processing output wires")
+output_wire = num_input_wires + num_wires
+output_wire_names = []
+for op in outputs:
+    name, l = getIOProp(op)
+    num_output_wires += l
+    output_lens.append(str(l))
+    prop_lens[name] = l
+    if l == 1:
+        wire_mapping[name] = output_wire
+        output_wire_names.append(name)
+        output_wire += 1
+    else:
+        for i in range(l):
+            wire_mapping["%s[%s]" % (name, i)] = output_wire
+            output_wire_names.append("%s[%s]" % (name, i))
+            output_wire += 1
+
+total_num_wires = sum([num_input_wires, num_output_wires, num_wires])
+
+# Set gnd and high wire.
+STR_GND_WIRE = "YOSYS_STR_GND_WIRE"
+STR_HIGH_WIRE = "YOSYS_STR_HIGH_WIRE"
+wire_mapping[STR_GND_WIRE] = num_input_wires
+wire_mapping[STR_HIGH_WIRE] = num_input_wires + 1
+
+
+# Process assignments
+logging.info("Processing assigments")
+class Instruction(object):
+    def __init__(self, inputs, output, cmd):
+        self.inputs = list(map(lambda a: a.strip(), inputs))
+        # Only a single output is supported
+        self.output = output.strip()
+        self.cmd = cmd
+    def __str__(self):
+        io = ' '.join(list(map(lambda a: str(wire_mapping[a]), self.inputs + [self.output])))
+        return "%d 1 %s %s" % (len(self.inputs), io, self.cmd) 
+
+code = []
+code.append(Instruction([STR_GND_WIRE], STR_HIGH_WIRE, "INV"))
+try:
+    for assign in assigns:
+        result_wire = re.findall(r"([^=]+)=", assign)[0].strip()
+        operation_side = re.findall(r"=(.*)$", assign)[0].strip()
+        # AND
+        is_and = re.findall(r"(.*)\&(.*)", operation_side)
+        if len(is_and) > 0:
+            assert(len(is_and[0]) == 2)
+            code.append(Instruction([is_and[0][0], is_and[0][1]], result_wire, "AND"))
+            continue
+        # NOT (INV)
+        is_inv = re.findall(r"~(.*)$", operation_side)
+        if len(is_inv) > 0:
+            assert(len(is_inv) == 1)
+            code.append(Instruction([is_inv[0]], result_wire, "INV"))
+            continue
+        # XOR
+        is_xor = re.findall(r"(.*)\^(.*)$", operation_side)
+        if len(is_xor) > 0:
+            assert(len(is_xor[0]) == 2)
+            code.append(Instruction([is_xor[0][0], is_xor[0][1]], result_wire, "XOR"))
+            continue
+
+        # ADD CUSTOM GATES HERE
+
+
+
+
+        # ASSIGNMENT
+        # Note this last part handles assignments to wires/vars
+        operation_side = "{%s}" % operation_side
+        result_var = re.findall(r"^([^[]+)", result_wire)[0]
+        result_arr1 = re.findall(r"\[(.+)\]", result_wire)
+        result_arr2 = re.findall(r"\[(.+):(.+)\]", result_wire)
+        # Part of variable is being assigned e.g. assign a[1] =
+        if len(result_arr2) > 0:
+            tmp_wire_start = int(result_arr2[0][1])
+            tmp_wire_end = int(result_arr2[0][0])
+        elif len(result_arr1) > 0:
+            tmp_wire_start = int(result_arr1[0][0])
+            tmp_wire_end = tmp_wire_start
+        # The whole variable is assigned e.g. assign a =
+        else:
+            tmp_wire_start = 0
+            tmp_wire_end = prop_lens[result_var] - 1
+        tmp_wires = [None for i in range(prop_lens[result_var])]
+        # Double the commas: re.findall is non-overlapping, so each part needs its own delimiters
+        operation_side = operation_side.replace(",", ",,")
+        parts = re.findall(r"[{,]([^{},]+)[},]", operation_side)
+        pos_i = tmp_wire_end 
+        for part in parts:
+            part = part.strip()
+            result_arr = re.findall(r"(\d{1,})'b(.*)", part)
+            # Static assignment
+            if len(result_arr) == 1:
+                for i in range(int(result_arr[0][0])):
+                    if result_arr[0][1][i] == '1':
+                        tmp_wires[pos_i] = STR_HIGH_WIRE
+                    elif result_arr[0][1][i] == '0':
+                        tmp_wires[pos_i] = STR_GND_WIRE
+                    # Note x is just skipped
+                    pos_i -= 1
+            else:
+                if part in wire_mapping:
+                    tmp_wires[pos_i] = part
+                    pos_i -= 1
+                else:
+                    a_var = re.findall(r"^([^[]+)", part)[0]
+                    a_arr = re.findall(r"\[(.+):(.+)\]", part)
+                    if len(a_arr) > 0:
+                        for i in range(int(a_arr[0][0]), int(a_arr[0][1]) - 1, -1):
+                            tmp_wires[pos_i] = "%s[%s]" % (a_var, i)
+                            pos_i -= 1
+                    else:
+                        for i in range(prop_lens[part] - 1, -1, -1):
+                            tmp_wires[pos_i] = "%s[%s]" % (part, i)
+                            pos_i -= 1
+        assert pos_i + 1 == tmp_wire_start
+        if result_var in wire_mapping and tmp_wire_start == tmp_wire_end and tmp_wire_start == 0:
+            code.append(Instruction([tmp_wires[0]], "%s" % result_var, "EQW"))
+        else:
+            for i in range(tmp_wire_start, tmp_wire_end + 1):
+                if tmp_wires[i] == None:
+                    continue
+                code.append(Instruction([tmp_wires[i]], "%s[%s]" % (result_var, i), "EQW"))
+
+except Exception as e:
+    logging.error(e)
+    exit()
+
+# Apply some quick optimisations
+#- Remove output wires which are not used and not connected to an output port
+while True:
+    output_wires = set(map(lambda a: a.output, code))
+    input_wires = []
+    remove_wires = []
+    remove_wires_mappings = []
+    for instr in code:
+        input_wires += instr.inputs
+    input_wires = set(input_wires)
+    for output_wire in output_wires:
+        if output_wire not in input_wires and output_wire not in output_wire_names and wire_mapping[output_wire] >= num_input_wires:
+            remove_wires.append(output_wire)
+            remove_wires_mappings.append(wire_mapping[output_wire])
+            del wire_mapping[output_wire]
+    if len(remove_wires) == 0:
+        break
+    # Remove from code
+    i = len(code) - 1
+    while i >= 0:
+        instr = code[i]
+        if instr.output in remove_wires:
+            del code[i]
+        i -= 1
+    total_num_wires -= len(remove_wires_mappings)
+    # Adjust wire mappings
+    for wire in wire_mapping:
+        curr = wire_mapping[wire]
+        if curr < num_input_wires:
+            continue
+        num = reduce(lambda c, a: c + int(a < curr), [0] + remove_wires_mappings)
+        wire_mapping[wire] -= num
+
+# Output the Bristol Format
+logging.info("Outputting the Bristol formatted netlist to %s" % args.output_file)
+with open(args.output_file, 'w') as f:
+    f.write("%s %s\n" % (len(code), total_num_wires))
+    f.write("%s %s\n" % (len(input_lens), ' '.join(input_lens)))
+    f.write("%s %s\n" % (len(output_lens), ' '.join(output_lens)))
+    f.write("\n")
+    for instr in code:
+        f.write("%s\n" % str(instr))
+    f.write("\n\n")
diff --git a/Compiler/floatingpoint.py b/Compiler/floatingpoint.py
index 681a0cb0..6f320014 100644
--- a/Compiler/floatingpoint.py
+++ b/Compiler/floatingpoint.py
@@ -571,6 +571,22 @@ def SDiv_mono(a, b, l, kappa):
     y = TruncPr(y, 3 * l, 2 * l, kappa)
     return y
 
+##
+# SDiv as annotated in ABZS12. It performs
+# division using Newton-Raphson from an approximation on
+# 2^l
+def SDiv_ABZS12(a, b, l, kappa):
+    theta = int(ceil(log(l, 2)))
+    x = b
+    y = a
+    for i in range(theta -1):
+        y = y * ((2 ** (l + 1)) - x)
+        y = TruncPr(y, 2 * l + 1, l, kappa)
+        x = x * ((2 ** (l + 1)) - x)
+        x = TruncPr(x, 2 * l + 1, l, kappa)
+    y = y * ((2 ** (l + 1)) - x)
+    y = TruncPr(y, 2 * l + 1, l, kappa)
+    return y
 
 def AdditionKOp(a,b):
     return a + b
diff --git a/Compiler/instructions.py b/Compiler/instructions.py
index d1b0dc8f..3a828eba 100644
--- a/Compiler/instructions.py
+++ b/Compiler/instructions.py
@@ -31,6 +31,8 @@
   LDARG= 0x11,
   REQBL= 0x12,
   STARG= 0x13,
+  CALL= 0x14,
+  RETURN= 0x15,
   RUN_TAPE= 0x19,
   JOIN_TAPE= 0x1A,
   CRASH= 0x1B,
@@ -87,6 +89,7 @@
   TRIPLE= 0x50,
   BIT= 0x51,
   SQUARE= 0x52,
+  DABIT= 0x53,
 
   # sregint/sbit instructions
   LDMSINT= 0x60,
@@ -136,7 +139,6 @@
   LTINT= 0x95,
   GTINT= 0x96,
   EQINT= 0x97,
-  JMPI= 0x98,
 
   # Integers
   LDINT= 0x9A,
@@ -153,16 +155,15 @@
   CONVSREGSINT= 0xC4,
 
   # Debug Printing
-  PRINTMEM= 0xB0,
-  PRINTREG= 0XB1,
-  PRINTREGPLAIN= 0xB2,
-  PRINTCHR= 0xB3,
-  PRINTSTR= 0xB4,
-  PRINTCHRINT= 0xB5,
-  PRINTSTRINT= 0xB6,
-  PRINTFLOATPLAIN= 0xB7,
-  PRINTFIXPLAIN= 0xB8,
-  PRINTINT= 0xB9,
+  PRINT_MEM= 0xB0,
+  PRINT_REG= 0xB2,
+  PRINT_CHAR= 0xB3,
+  PRINT_CHAR4= 0xB4,
+  PRINT_CHAR_REGINT= 0xB5,
+  PRINT_CHAR4_REGINT= 0xB6,
+  PRINT_FLOAT= 0xB7,
+  PRINT_FIX= 0xB8,
+  PRINT_INT= 0xB9,
 
   # Comparison of sregints
   EQZSINT = 0xD0,
@@ -187,8 +188,8 @@
 
   # Others
   RAND= 0xE0,
-  START_TIMER= 0xE1,
-  STOP_TIMER= 0xE2,
+  START_CLOCK= 0xE1,
+  STOP_CLOCK= 0xE2,
 
   # Local functions
   LF_CINT= 0xEA,
@@ -305,8 +306,9 @@ class movsint(base.Instruction):
 
 @base.vectorize
 class opensint(base.Instruction):
-    """ Open the sregint in sr_j and assign it to r_i.
-         This instruction is vectorizable
+    """ OPENSINT i j
+        Open the sregint in sr_j and assign it to r_i.
+        This instruction is vectorizable
      """
     __slots__ = []
     code = base.opcodes['OPENSINT']
@@ -315,8 +317,9 @@ class opensint(base.Instruction):
 
 @base.vectorize
 class opensbit(base.Instruction):
-    """ Open the sbit in sb_j and assign it to r_i.
-         This instruction is vectorizable
+    """ OPENSBIT i j
+        Open the sbit in sb_j and assign it to r_i.
+        This instruction is vectorizable
      """
     __slots__ = []
     code = base.opcodes['OPENSBIT']
@@ -368,7 +371,7 @@ class mulsintc(base.Instruction):
 
 @base.vectorize
 class mul2sint(base.Instruction):
-    r""" MUL2SINTC i j u v
+    r""" MUL2SINT i j u v
          Full multiplication of secret registers (sr_i || sr_j )=sr_u \cdot sr_v.
          Where sr_i is the most significant word and sr_j is the least
          significant word of the output.
@@ -395,7 +398,7 @@ class LF_CINT(base.VarArgsInstruction):
     r""" LF_CINT i0, i1, i2, i3, i4, i5 [outputs], [inputs]
          This calls the Local Function with index i0, which
          produces i1 cints as output, and takes i2 rints,
-         i3 srints, i4 cints and i5 srints as input.
+         i3 srints, i4 cints and i5 sints as input.
     """
     code = base.opcodes['LF_CINT']
     def __init__(self, *args):
@@ -413,7 +416,7 @@ class LF_SINT(base.VarArgsInstruction):
     r""" LF_SINT i0, i1, i2, i3, i4, i5 [outputs], [inputs]
          This calls the Local Function with index i0, which
          produces i1 sints as output, and takes i2 rints,
-         i3 srints, i4 cints and i5 srints as input.
+         i3 srints, i4 cints and i5 sints as input.
     """
     code = base.opcodes['LF_SINT']
     def __init__(self, *args):
@@ -430,7 +433,7 @@ class LF_REGINT(base.VarArgsInstruction):
     r""" LF_REGINT i0, i1, i2, i3, i4, i5 [outputs], [inputs]
          This calls the Local Function with index i0, which
          produces i1 regints as output, and takes i2 rints,
-         i3 srints, i4 cints and i5 srints as input.
+         i3 srints, i4 cints and i5 sints as input.
     """
     code = base.opcodes['LF_REGINT']
     def __init__(self, *args):
@@ -447,7 +450,7 @@ class LF_SREGINT(base.VarArgsInstruction):
     r""" LF_SREGINT i0, i1, i2, i3, i4, i5 [outputs], [inputs]
          This calls the Local Function with index i0, which
          produces i1 sregints as output, and takes i2 rints,
-         i3 srints, i4 cints and i5 srints as input.
+         i3 srints, i4 cints and i5 sints as input.
     """
     code = base.opcodes['LF_SREGINT']
     def __init__(self, *args):
@@ -464,7 +467,7 @@ def __init__(self, *args):
 
 @base.vectorize
 class subsintc(base.Instruction):
-    r""" SUBC i j k
+    r""" SUBSINTC i j k
          Subtracts secret and clear registers sr_i=sr_j-r_k.
          This instruction is vectorizable
      """
@@ -475,7 +478,7 @@ class subsintc(base.Instruction):
 
 @base.vectorize
 class subsint(base.Instruction):
-    r""" SUBS i j k
+    r""" SUBSINT i j k
          Subtracts secret registers sr_i=sr_j-sr_k.
          This instruction is vectorizable
      """
@@ -486,7 +489,7 @@ class subsint(base.Instruction):
 
 @base.vectorize
 class subcints(base.Instruction):
-    r""" SUBC i j k
+    r""" SUBCINTS i j k
          Subtracts clear and secret registers sr_i=r_j-sr_k.
          This instruction is vectorizable
      """
@@ -529,7 +532,7 @@ class shrsint(base.Instruction):
 @base.vectorize
 class neg(base.Instruction):
     r""" NEG i j
-         Negation of a a secret register s_i=-s_j .
+         Negation of an sregint sr_i=-sr_j .
          This instruction is vectorizable
      """
     __slots__ = []
@@ -700,7 +703,7 @@ class bitsint(base.Instruction):
 @base.vectorize
 class sintbit(base.Instruction):
     r""" SINTBIT i j k n
-         Assigns si to sj, and then sets the n-th bit to be sb_k
+         Assigns sri to srj, and then sets the n-th bit to be sb_k
          This instruction is vectorizable
      """
     __slots__ = ["code"]
@@ -986,6 +989,7 @@ class ldtn(base.Instruction):
 class ldarg(base.Instruction):
     r""" LDARG i
          Assigns the argument passed to the current thread to the regint register r_i.
+         This is also used to pass variables to functions.
          This instruction is vectorizable
      """
     code = base.opcodes['LDARG']
@@ -996,6 +1000,7 @@ class ldarg(base.Instruction):
 class starg(base.Instruction):
     r""" STARG i
          Assigns register r_i to variable in the thread argument.
+         This is also used to pass variables to functions.
          This instruction is vectorizable
      """
     code = base.opcodes['STARG']
@@ -1034,6 +1039,25 @@ class crash(base.IOInstruction):
     code = base.opcodes['CRASH']
     arg_format = []
 
+class CALL(base.JumpInstruction):
+    r""" CALL n
+         Pushes the current PC onto the stack, and then performs an unconditional relative jump of n+1 instructions. 
+     """
+    __slots__ = []
+    code = base.opcodes['CALL']
+    arg_format = ['int']
+    jump_arg = 0
+
+class RETURN(base.JumpInstruction):
+    r""" RETURN
+         Pops an integer off the stack, and sets the program counter
+         to this value. Used to return from sub-routines executed
+         by CALL
+     """
+    code = base.opcodes['RETURN']
+    arg_format = []
+    jump_arg = -1
+
 class restart(base.IOInstruction):
     r""" RESTART
          Restart the runtime by reloading the schedule file. 
@@ -1351,7 +1375,7 @@ class mulci(base.Instruction):
 
 @base.vectorize
 class mulsi(base.Instruction):
-    r""" MULCI i j n
+    r""" MULSI i j n
          Multiplication of secret register by immediate value s_i=s_j \cdot n.
          This instruction is vectorizable
      """
@@ -1374,7 +1398,7 @@ class divci(base.Instruction):
 
 @base.vectorize
 class modci(base.Instruction):
-    r""" MODC i j n
+    r""" MODCI i j n
          Clear division with remainder c_i=c_j%n (after lifting to the integers) by an immediate
          This instruction is vectorizable
      """
@@ -1496,6 +1520,16 @@ class bit(base.DataInstruction):
     arg_format = ['sw']
     data_type = 'bit'
 
+@base.vectorize
+class dabit(base.DataInstruction):
+    r""" DABIT i j
+         Load sint, sbit registers s_i and sb_j with the next secret dabit.
+         This instruction is vectorizable
+     """
+    __slots__ = []
+    code = base.opcodes['DABIT']
+    arg_format = ['sw', 'sbw']
+    data_type = 'dabit'
 
 @base.vectorize
 class square(base.DataInstruction):
@@ -1525,83 +1559,68 @@ class private_input(base.IOInstruction):
 
 @base.vectorize
 class print_mem(base.IOInstruction):
-    r""" PRINTMEM i
+    r""" PRINT_MEM i
          Print value in clear memory C[i] to debug IO channel.
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
     __slots__ = []
-    code = base.opcodes['PRINTMEM']
+    code = base.opcodes['PRINT_MEM']
     arg_format = ['int']
 
 
 @base.vectorize
 class print_reg(base.IOInstruction):
-    r""" PRINTREG i j
-         Print value of cint register c_i to debug IO channel and 4-char comment j
-         Can only be executed in thread zero.
-         This instruction is vectorizable
-     """
-    __slots__ = []
-    code = base.opcodes['PRINTREG']
-    arg_format = ['c', 'i']
-
-    def __init__(self, reg, comment=''):
-        super(print_reg_class, self).__init__(reg, self.str_to_int(comment))
-
-
-@base.vectorize
-class print_reg_plain(base.IOInstruction):
-    r""" PRINTREGPLAIN i
+    r""" PRINT_REG i
          As above but skips the comment j
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
     __slots__ = []
-    code = base.opcodes['PRINTREGPLAIN']
+    code = base.opcodes['PRINT_REG']
     arg_format = ['c']
 
 @base.vectorize
-class print_fix_plain(base.IOInstruction):
-    r""" PRINTFIXPLAIN i f k
+class print_fix(base.IOInstruction):
+    r""" PRINT_FIX i f k
          Prints the fixed point number in cint register c_i using parameters f and k
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
     __slots__ = []
-    code = base.opcodes['PRINTFIXPLAIN']
+    code = base.opcodes['PRINT_FIX']
     arg_format = ['c', 'i', 'i']
 
 @base.vectorize
-class print_float_plain(base.IOInstruction):
-    r""" PRINTFLOATPLAIN i j k l
+class print_float(base.IOInstruction):
+    r""" PRINT_FLOAT i j k l
          Prints the floating point number in cint registers (c_i, c_j, c_k, c_l) assuming they map to the representation (v,p,z,s)
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
     __slots__ = []
-    code = base.opcodes['PRINTFLOATPLAIN']
+    code = base.opcodes['PRINT_FLOAT']
     arg_format = ['c', 'c', 'c', 'c']
 
-
+@base.vectorize
 class print_int(base.IOInstruction):
-    r""" PRINTINT i
+    r""" PRINT_INT i
          Prints the value of register r_i to debug IO channel. 
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
     __slots__ = []
-    code = base.opcodes['PRINTINT']
+    code = base.opcodes['PRINT_INT']
     arg_format = ['r']
 
 
 class print_char(base.IOInstruction):
-    r""" PRINTCHAR i
+    r""" PRINT_CHAR i
          Prints a single character i to debug IO channel.
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
-    code = base.opcodes['PRINTCHR']
+    code = base.opcodes['PRINT_CHAR']
     arg_format = ['int']
 
     def __init__(self, ch):
@@ -1609,12 +1628,12 @@ def __init__(self, ch):
 
 
 class print_char4(base.IOInstruction):
-    r""" PRINTSTR i
+    r""" PRINT_CHAR4 i
          Print a 4 character string i to debug IO channel.
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
-    code = base.opcodes['PRINTSTR']
+    code = base.opcodes['PRINT_CHAR4']
     arg_format = ['int']
 
     def __init__(self, val):
@@ -1623,22 +1642,22 @@ def __init__(self, val):
 
 @base.vectorize
 class print_char_regint(base.IOInstruction):
-    r""" PRINTCHRINT i
+    r""" PRINT_CHAR_REGINT i
          Print regint register r_i as a single character to debug IO channel.
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
-    code = base.opcodes['PRINTCHRINT']
+    code = base.opcodes['PRINT_CHAR_REGINT']
     arg_format = ['r']
 
 @base.vectorize
 class print_char4_regint(base.IOInstruction):
-    r""" PRINTSTRINT i
+    r""" PRINT_CHAR4_REGINT i
          Print regint register r_i as a four character string to debug IO channel.
          Can only be executed in thread zero.
          This instruction is vectorizable
      """
-    code = base.opcodes['PRINTSTRINT']
+    code = base.opcodes['PRINT_CHAR4_REGINT']
     arg_format = ['r']
 
 @base.vectorize
@@ -1667,11 +1686,11 @@ class input_int(base.IOInstruction):
     arg_format = ['rw','i']
 
 class open_chan(base.IOInstruction):
-    r""" OPEN_CHAN n
+    r""" OPEN_CHAN i n
          Opens channel number n for reading/writing on the IO class.
          Channels are assumed to be bi-directional, i.e. can read and write.
          This is provided as some IO classes may require this to be called explicitly, the default one does not need this.
-         The return value *can* be some error code which the IO class may want to return.
+         The return value r_i *can* be some error code which the IO class may want to return.
          Can only be executed in thread zero.
      """
     __slots__ = []
@@ -1720,7 +1739,7 @@ def has_var_args(self):
 
 @base.vectorize
 class output_clear(base.IOInstruction):
-    r""" OUTPUT i n
+    r""" OUTPUT_CLEAR i n
          Public output of cint register c_i to IO class on channel n.
          This instruction is vectorizable
          Can only be executed in thread zero.
@@ -1829,7 +1848,7 @@ class divint(base.Instruction):
 #
 
 @base.vectorize
-class eqzc(base.Instruction):
+class eqzint(base.Instruction):
     r""" EQZINT i j
          Clear comparison to zero test of regint registers r_i = (r_j == 0).
          This instruction is vectorizable
@@ -1840,7 +1859,7 @@ class eqzc(base.Instruction):
 
 
 @base.vectorize
-class ltzc(base.Instruction):
+class ltzint(base.Instruction):
     r""" LTZINT i j
          Clear comparison of regint registers r_i = (r_j < 0).
          This instruction is vectorizable
@@ -1851,7 +1870,7 @@ class ltzc(base.Instruction):
 
 
 @base.vectorize
-class ltc(base.Instruction):
+class ltint(base.Instruction):
     r""" LTINT i j k
          Clear comparison of regint registers r_i = (r_j < r_k).
          This instruction is vectorizable
@@ -1862,7 +1881,7 @@ class ltc(base.Instruction):
 
 
 @base.vectorize
-class gtc(base.Instruction):
+class gtint(base.Instruction):
     r""" GTINT i j k
          Clear comparison of regint registers r_i = (r_j > r_k).
          This instruction is vectorizable
@@ -1873,7 +1892,7 @@ class gtc(base.Instruction):
 
 
 @base.vectorize
-class eqc(base.Instruction):
+class eqint(base.Instruction):
     r""" EQINT i j k
          Clear comparison of regint registers r_i = (r_j == r_k).
          This instruction is vectorizable
@@ -1897,16 +1916,6 @@ class jmp(base.JumpInstruction):
     jump_arg = 0
 
 
-class jmpi(base.JumpInstruction):
-    r""" JMPI i
-         Unconditional relative jump of r_i+1 instructions. 
-     """
-    __slots__ = []
-    code = base.opcodes['JMPI']
-    arg_format = ['r']
-    jump_arg = 0
-
-
 class jmpnz(base.JumpInstruction):
     r""" JMPNZ i n
          Jump of n+1 instructions if regint register r_i \neq 0.
@@ -1948,8 +1957,9 @@ class convint(base.Instruction):
 
 @base.vectorize
 class convmodp(base.Instruction):
-    """ CONVMODP i j
-        Convert from cint register c_j to regint register r_i.
+    """ CONVMODP i j n
+        Convert from cint register c_j to regint register r_i with
+        bitlength of c_j equal to n
         This instruction is vectorizable
      """
     __slots__ = []
@@ -1988,17 +1998,17 @@ class stopopen(base.VarArgsInstruction):
 
 
 class start_clock(base.Instruction):
-    r""" START_TIMER n
+    r""" START_CLOCK n
          Re-initializes the specified timer n 
      """
-    code = base.opcodes['START_TIMER']
+    code = base.opcodes['START_CLOCK']
     arg_format = ['i']
 
 class stop_clock(base.Instruction):
-    r""" STOP_TIMER n
+    r""" STOP_CLOCK n
          Prints the time since the last initialization of timer n 
      """
-    code = base.opcodes['STOP_TIMER']
+    code = base.opcodes['STOP_CLOCK']
     arg_format = ['i']
 
 
diff --git a/Compiler/instructions_base.py b/Compiler/instructions_base.py
index c6e75688..0f42096a 100644
--- a/Compiler/instructions_base.py
+++ b/Compiler/instructions_base.py
@@ -41,6 +41,8 @@
   LDARG= 0x11,
   REQBL= 0x12,
   STARG= 0x13,
+  CALL= 0x14,
+  RETURN=0x15,
   RUN_TAPE= 0x19,
   JOIN_TAPE= 0x1A,
   CRASH= 0x1B,
@@ -97,6 +99,7 @@
   TRIPLE= 0x50,
   BIT= 0x51,
   SQUARE= 0x52,
+  DABIT= 0x53,
 
   # sregint/sbit instructions
   LDMSINT= 0x60,
@@ -146,7 +149,6 @@
   LTINT= 0x95,
   GTINT= 0x96,
   EQINT= 0x97,
-  JMPI= 0x98,
 
   # Integers
   LDINT= 0x9A,
@@ -163,16 +165,15 @@
   CONVSREGSINT= 0xC4,
 
   # Debug Printing
-  PRINTMEM= 0xB0,
-  PRINTREG= 0XB1,
-  PRINTREGPLAIN= 0xB2,
-  PRINTCHR= 0xB3,
-  PRINTSTR= 0xB4,
-  PRINTCHRINT= 0xB5,
-  PRINTSTRINT= 0xB6,
-  PRINTFLOATPLAIN= 0xB7,
-  PRINTFIXPLAIN= 0xB8,
-  PRINTINT= 0xB9,
+  PRINT_MEM= 0xB0,
+  PRINT_REG= 0xB2,
+  PRINT_CHAR= 0xB3,
+  PRINT_CHAR4= 0xB4,
+  PRINT_CHAR_REGINT= 0xB5,
+  PRINT_CHAR4_REGINT= 0xB6,
+  PRINT_FLOAT= 0xB7,
+  PRINT_FIX= 0xB8,
+  PRINT_INT= 0xB9,
 
   # Comparison of sregints
   EQZSINT = 0xD0,
@@ -197,8 +198,8 @@
 
   # Others
   RAND= 0xE0,
-  START_TIMER= 0xE1,
-  STOP_TIMER= 0xE2,
+  START_CLOCK= 0xE1,
+  STOP_CLOCK= 0xE2,
 
   # Local functions
   LF_CINT= 0xEA,
@@ -631,6 +632,8 @@ def set_relative_jump(self, value):
         self.args[self.jump_arg] = value
 
     def get_relative_jump(self):
+        if self.jump_arg == -1:
+	  return -1
         return self.args[self.jump_arg]
 
 
diff --git a/Compiler/library.py b/Compiler/library.py
index 65296046..ddd930e1 100644
--- a/Compiler/library.py
+++ b/Compiler/library.py
@@ -68,15 +68,15 @@ def print_plain_str(ss):
                 val = args[i]
             if isinstance(val, program.Tape.Register):
                 if val.is_clear:
-                    val.print_reg_plain()
+                    val.print_reg()
                 else:
                     raise CompilerError('Cannot print secret value:', args[i])
             elif isinstance(val, sfix) or isinstance(val, sfloat):
                 raise CompilerError('Cannot print secret value:', args[i])
             elif isinstance(val, cfloat):
-                val.print_float_plain()
+                val.print_float()
             elif isinstance(val, cfix):
-                val.print_fix_plain()
+                val.print_fix()
             elif isinstance(val, list):
                 print_str('[' + ', '.join('%s' for i in range(len(val))) + ']', *val)
             else:
@@ -322,8 +322,7 @@ def on_first_call(self, wrapped_function):
         print 'Done compiling function', self.name
         p_return_address = get_tape().program.malloc(1, 'r')
         get_tape().function_basicblocks[block] = p_return_address
-        return_address = regint.load_mem(p_return_address)
-        get_tape().active_basicblock.set_exit(instructions.jmpi(return_address, add_to_prog=False))
+        get_tape().active_basicblock.set_exit(instructions.RETURN(add_to_prog=False))
         self.last_sub_block = get_tape().active_basicblock
         get_tape().close_scope(old_block, parent_node, 'end-' + self.name)
         old_block.set_exit(instructions.jmp(0, add_to_prog=False), get_tape().active_basicblock)
@@ -336,11 +335,9 @@ def on_call(self, base, bases):
         if block not in get_tape().function_basicblocks:
             raise CompilerError('unknown function')
         old_block = get_tape().active_basicblock
-        old_block.set_exit(instructions.jmp(0, add_to_prog=False), block)
+        old_block.set_exit(instructions.CALL(0, add_to_prog=False), block)
         p_return_address = get_tape().function_basicblocks[block]
         return_address = get_tape().new_reg('r')
-        old_block.return_address_store = instructions.ldint(return_address, 0)
-        instructions.stmint(return_address, p_return_address)
         get_tape().start_new_basicblock(name='call-' + self.name)
         get_tape().active_basicblock.set_return(old_block, self.last_sub_block)
         get_tape().req_node.children.append(self.node)
diff --git a/Compiler/oram.py b/Compiler/oram.py
index 0cbeec27..093e26b8 100644
--- a/Compiler/oram.py
+++ b/Compiler/oram.py
@@ -430,7 +430,7 @@ def __repr__(self):
 class RAM(RefRAM):
     """ List of entries in memory. """
     def __init__(self, size, entry_type, index=0):
-        #print_reg(cint(0), 'r in')
+        #print_reg_char4(cint(0), 'r in')
         self.size = size
         self.entry_type = entry_type
         self.l = [t.dynamic_array(self.size, t) for t in entry_type]
@@ -885,16 +885,16 @@ def __repr__(self, depth=0):
                 self.ref_children(1).__repr__(depth + 1)
         return result
     def output(self):
-        print_reg(cint(self.depth), 'buck')
+        print_reg_char4(cint(self.depth), 'buck')
         Program.prog.curr_tape.start_new_basicblock()
         self.bucket.output()
-        print_reg(cint(self.depth), 'dep')
+        print_reg_char4(cint(self.depth), 'dep')
         Program.prog.curr_tape.start_new_basicblock()
         @if_(self.p_children(1) < oram.n_buckets())
         def f():
             for i in (0,1):
                 child = self.ref_children(i)
-                print_reg(cint(i), 'chil')
+                print_reg_char4(cint(i), 'chil')
                 Program.prog.curr_tape.start_new_basicblock()
                 child.output()
 
@@ -954,7 +954,7 @@ def update(self, index, value):
         return self.value_type(read_value)
     def output(self):
         for i,v in enumerate(self):
-            print_reg(v.reveal(), 'i %d' % i)
+            print_reg_char4(v.reveal(), 'i %d' % i)
     __getitem__ = lambda self,index: List.__getitem__(self, index)[0]
 
 def get_n_threads_for_tree(size):
@@ -1013,7 +1013,7 @@ def add_to_root(self, state, is_empty, v, *x):
         l = state
         self.root.bucket.add(Entry(v, (l,) + x, is_empty))
     def evict_bucket(self, bucket, d):
-        #print_reg(cint(0), 'evb')
+        #print_reg_char4(cint(0), 'evb')
         #print 'pre', bucket
         entry = bucket.bucket.pop()
         #print 'evict', entry
@@ -1107,7 +1107,7 @@ def read_and_remove(self, u):
     def add(self, entry, state=None):
         if state is None:
             state = self.state.read()
-        #print_reg(cint(0), 'add')
+        #print_reg_char4(cint(0), 'add')
         #print 'add', id(self)
         #print 'pre-add', self
         maybe_start_timer(4)
@@ -1122,21 +1122,21 @@ def add(self, entry, state=None):
         #print 'post-evict', self
     def evict(self):
         #print 'evict root', id(self)
-        #print_reg(cint(0), 'ev_r')
+        #print_reg_char4(cint(0), 'ev_r')
         self.evict_bucket(self.root, 0)
         self.check()
         if self.D > 1:
             #print 'evict 1', id(self)
-            #print_reg(cint(0), 'ev1')
+            #print_reg_char4(cint(0), 'ev1')
             self.evict2(self.root.p_children(0), self.root.p_children(1), 1)
             self.check()
         if self.D > 2:
-            #print_reg(cint(self.D), 'D')
+            #print_reg_char4(cint(self.D), 'D')
             @for_range(2, self.D)
             def f(d):
-                #print_reg(d, 'ev2')
+                #print_reg_char4(d, 'ev2')
                 #print 'evict 2', id(self)
-                #print_reg(d, 'evl2')
+                #print_reg_char4(d, 'evl2')
                 s1 = regint.get_random(d)
                 s2 = MemValue(regint(0))
                 @do_while
@@ -1537,8 +1537,8 @@ def __repr__(self):
         return repr(self.l)
     def output(self):
         if self.small:
-            print_reg(self.l[0].reveal(), 'i0')
-            print_reg(self.l[1].reveal(), 'i1')
+            print_reg_char4(self.l[0].reveal(), 'i0')
+            print_reg_char4(self.l[1].reveal(), 'i1')
 
 class PackedORAMWithEmpty(AbstractORAM, PackedIndexStructure):
     def __init__(self, size, entry_size=None, value_type=sint, init_rounds=-1):
@@ -1657,7 +1657,7 @@ def test_oram_access(oram_type, N, value_type=sint, index_size=None, iterations=
     oram = oram_type(N, value_type=value_type, entry_size=32, \
                          init_rounds=0)
     print 'initialized'
-    print_reg(cint(0), 'init')
+    print_reg_char4(cint(0), 'init')
     stop_timer()
     # synchronize
     Program.prog.curr_tape.start_new_basicblock(name='sync')
@@ -1686,9 +1686,9 @@ def test_batch_init(oram_type, N):
     value_type = sint
     oram = oram_type(N, value_type)
     print 'initialized'
-    print_reg(cint(0), 'init')
+    print_reg_char4(cint(0), 'init')
     oram.batch_init([value_type(i) for i in range(N)])
-    print_reg(cint(0), 'done')
+    print_reg_char4(cint(0), 'done')
     @for_range(N)
     def f(i):
         x = oram[value_type(i)]
diff --git a/Compiler/program.py b/Compiler/program.py
index b1a92552..c7a71547 100644
--- a/Compiler/program.py
+++ b/Compiler/program.py
@@ -22,6 +22,7 @@
     bit = 2,
     inverse = 3,
     bittriple = 4,
+    dabit = 5,
 )
 
 field_types = dict(
@@ -279,9 +280,9 @@ def finalize_tape(self, tape):
         if not tape.purged:
             tape.optimize(self.options)
             tape.write_bytes()
+            if self.options.asmoutfile:
+                tape.write_str(self.options.asmoutfile + '-' + tape.name+'.asm')
             tape.purge()
-        if self.options.asmoutfile:
-            tape.write_str(self.options.asmoutfile + '-' + tape.name)
     
     
     def restart_main_thread(self):
@@ -349,6 +350,7 @@ def finalize_memory(self):
         self.curr_tape.start_new_basicblock(None, 'memory-usage')
         # reset register counter to 0
         self.curr_tape.init_registers()
+        library.jmp(0);  # Create a new basic block for the set memory instructions
         for mem_type,size in self.allocated_mem.items():
             if size:
                 print "Memory of type '%s' of size %d" % (mem_type, size)
@@ -417,10 +419,6 @@ def set_return(self, previous_block, sub_block):
             self.previous_block = previous_block
             self.sub_block = sub_block
 
-        def adjust_return(self):
-            offset = self.sub_block.get_offset(self)
-            self.previous_block.return_address_store.args[1] = offset
-        
         def set_exit(self, condition, exit_true=None):
             """ Sets the block which we start from next, depending on the condition.
 
@@ -554,8 +552,6 @@ def optimize(self, options):
         for block in self.basicblocks:
             if block.exit_block is not None:
                 block.adjust_jump()
-            if block.previous_block is not None:
-                block.adjust_return()
 
         # now remove any empty blocks (must be done after setting jumps)
         self.basicblocks = filter(lambda x: len(x.instructions) != 0, self.basicblocks)
@@ -579,7 +575,9 @@ def alloc_loop(block):
                         (block.name, i, len(self.basicblocks))
                 if block.exit_condition is not None:
                     jump = block.exit_condition.get_relative_jump()
-                    if isinstance(jump, (int,long)) and jump < 0 and \
+                    if jump != -1 and  \
+                            isinstance(jump, (int,long)) and \
+                            jump < 0 and \
                             block.exit_block.scope is not None:
                         alloc_loop(block.exit_block.scope)
                 allocator.process(block.instructions, block.alloc_pool)
diff --git a/Compiler/types.py b/Compiler/types.py
index 1525bbcc..ede0c09c 100644
--- a/Compiler/types.py
+++ b/Compiler/types.py
@@ -254,12 +254,15 @@ def convert_from(self, val):
     @set_instruction_type
     @vectorize
     def print_reg(self, comment=''):
-        print_reg(self, comment)
+        print_reg(self)
+        print_char4("  # ")
+        print_char4(comment)
+	print_char("\n")
 
     @set_instruction_type
     @vectorize
-    def print_reg_plain(self):
-        print_reg_plain(self)
+    def print_reg_pl(self):
+        print_reg(self)
 
     @set_instruction_type
     @vectorize
@@ -652,17 +655,17 @@ def __rpow__(self, other):
         return other ** cint(self)
 
     def __eq__(self, other):
-        return self.int_op(other, eqc)
+        return self.int_op(other, eqint)
 
     def __ne__(self, other):
         if isinstance(other, _secretMod2): return NotImplemented
         return 1 - (self == other)
 
     def __lt__(self, other):
-        return self.int_op(other, ltc)
+        return self.int_op(other, ltint)
 
     def __gt__(self, other):
-        return self.int_op(other, gtc)
+        return self.int_op(other, gtint)
 
     def __le__(self, other):
         if isinstance(other, _secretMod2): return NotImplemented
@@ -731,7 +734,8 @@ def bit_compose(bits):
     def reveal(self):
         return self
 
-    def print_reg_plain(self):
+    @vectorize
+    def print_reg(self):
         print_int(self)
 
 
@@ -1145,6 +1149,12 @@ def __or__(self, other):
             orsb(res, self, other)
         return res
 
+    # There are two ways of doing negation, neg and invert
+    def __invert__(self):
+        res = sbit()
+        negb(res, self)
+        return res
+
     def __neg__(self):
         res = sbit()
         negb(res, self)
@@ -1203,7 +1213,7 @@ def eqz(self):
 
     def ltz(self):
         res = sbit()
-        ltzint(res, self)
+        ltzsint(res, self)
         return res
 
 
@@ -1596,6 +1606,7 @@ def __init__(self, v, size=None):
             self.v = v
         else:
             raise NotImplementedError
+        program.curr_tape.require_bit_length(2*self.k+1)
 
     @vectorize
     def load_int(self, v):
@@ -1785,9 +1796,9 @@ def __div__(self, other):
             raise TypeError('Incompatible fixed point types in division')
 
     @vectorize
-    def print_fix_plain(self):
+    def print_fix(self):
         r"""Prints the cfix in as a vector (y,f,k)"""
-        print_fix_plain(self.v, self.f, self.k)
+        print_fix(self.v, self.f, self.k)
 
 
 ##
@@ -1870,6 +1881,7 @@ def __init__(self, _v=None, size=None):
         elif isinstance(_v, regint):
             self.v = sint(_v, size=self.size) * 2 ** f
         self.kappa = sfix.kappa
+        program.curr_tape.require_bit_length(2*self.k+self.kappa+1)
 
     @vectorize
     def load_int(self, v):
@@ -2107,7 +2119,7 @@ def __init__(self, v, p=None, z=None, s=None, err=None, size=None):
             else:
                 v, p, z, s, err = self.convert_float(v, self.vlen, self.plen)
 
-        if isinstance(v, int):
+        if isinstance(v, (int,long)):
             if not ((v >= 2 ** (self.vlen - 1) and v < 2 ** (self.vlen)) or v == 0):
                 raise CompilerError('Floating point number malformed: significand')
             self.v = library.load_int_to_secret(v)
@@ -2140,6 +2152,7 @@ def __init__(self, v, p=None, z=None, s=None, err=None, size=None):
             self.err = library.load_int_to_secret(err)
         else:
             self.err = err
+        program.curr_tape.require_bit_length(2*self.vlen+self.kappa+1)
 
     def __iter__(self):
         yield self.v
@@ -2289,18 +2302,37 @@ def mul(self, other):
             other_parse = parse_float(other)
             return self * other_parse  # self.mul(scalar_float)
 
-    def __sub__(self, other):
-        return (self + -other)
+    ##
+    # float division (FLDiv) as described in ABZS12.
+    # Additional conditions for err added in alignment with
+    # the way we work with floating point numbers.
+    def local_division_ABZS12(self, other):
+        if isinstance(other, (cfloat, sfloat)):
+            l = self.vlen
+            v = floatingpoint.SDiv_ABZS12(self.v, other.v + other.z, l, self.kappa)
+            b = v.less_than(2 ** l, l + 1, self.kappa)
+            v = floatingpoint.Trunc(v * b + v, l + 1, 1, self.kappa)
+            p = (1 - self.z) * (self.p - other.p - l + 1 - b)
+            z = self.z
+            
+            #simple xor of sign
+            s = self.s + other.s - 2*self.s * other.s
+            if isinstance(other, sfloat):
+                err = other.err
+            err = err + self.err
+            err = err + self.__flow_detect__(p)
+            err = err + other.z
+            return sfloat(v, p, z, s, err)
+        else:
 
-    def __rsub__(self, other):
-        return -1 * self + other
+            other_parse = parse_float(other)
+            return self / other_parse
 
     ##
     # realizes the division protocol for several different types.
     # @param other: value dividing self, could be any type
-    # @return sloat: new sfloat instance
-    def __div__(self, other):
-
+    # @return sfloat: new sfloat instance
+    def local_division(self, other):
         if isinstance(other, (cfloat, sfloat)):
             v = floatingpoint.SDiv(self.v, other.v + other.z * (2 ** self.vlen - 1),
                                    self.vlen, self.kappa)
@@ -2328,6 +2360,16 @@ def __div__(self, other):
             other_parse = parse_float(other)
             return self / other_parse
 
+
+    def __sub__(self, other):
+        return (self + -other)
+
+    def __rsub__(self, other):
+        return -1 * self + other
+
+    def __div__(self, other):
+        return self.local_division_ABZS12(other)
+
     @vectorize
     def __neg__(self):
         return sfloat(self.v, self.p, self.z, (1 - self.s) * (1 - self.z), self.err)
@@ -2503,6 +2545,7 @@ def __init__(self, v, p=None, z=None, s=None, size=None):
             self.v = v
         else:  # missmatch of types validation
             raise CompilerError('Missmatching input type ')
+        program.curr_tape.require_bit_length(2*self.vlen+1)
 
         # validation of p
         if isinstance(p, int):
@@ -2554,8 +2597,8 @@ def set_zero(self, flag):
     # facade method that evokes low level instructions
     # to print float number.
     # No params, uses instance records.
-    def print_float_plain(self):
-        print_float_plain(self.v, self.p, self.z, self.s)
+    def print_float(self):
+        print_float(self.v, self.p, self.z, self.s)
 
     ##
     # computes the product times -1 of the cfloat
diff --git a/Documentation/ByteCodes.tex b/Documentation/ByteCodes.tex
index ac8e09eb..519cf4e1 100644
--- a/Documentation/ByteCodes.tex
+++ b/Documentation/ByteCodes.tex
@@ -61,67 +61,66 @@ \subsection{Overview}
 
 \begin{figure}[htb!]
 \begin{center}
-\begin{picture}{(370,280)}
-
-\put(0,25){\framebox(50,30){FHE Fact 1}}
-\put(0,75){\framebox(50,30){FHE Fact 2}}
-\put(50,90){\line(1,-1){25}}
-\put(50,40){\line(1,1){25}}
-\put(75,65){\line(1,0){25}}
-\put(100,15){\line(0,1){220}}
+\begin{picture}{(370,360)}
+
+\put(0,65){\framebox(50,30){FHE Fact 1}}
+\put(0,115){\framebox(50,30){FHE Fact 2}}
+\put(50,130){\line(1,-1){25}}
+\put(50,80){\line(1,1){25}}
+\put(75,105){\line(1,0){25}}
+\put(100,15){\line(0,1){300}}
+\put(100,315){\vector(1,0){20}}
+\put(100,275){\vector(1,0){20}}
 \put(100,235){\vector(1,0){20}}
 \put(100,195){\vector(1,0){20}}
-\put(100,155){\vector(1,0){20}}
+\put(100,135){\vector(1,0){20}}
 \put(100,95){\vector(1,0){20}}
 \put(100,55){\vector(1,0){20}}
 \put(100,15){\vector(1,0){20}}
-\put(100,125){\line(1,0){130}}
-\put(230,125){\vector(0,1){55}}
-\put(230,125){\vector(0,-1){55}}
 
 
-\put(40,195){\framebox(50,30){aAND}}
-\put(0,150){\framebox(50,30){aBit}}
+\put(40,235){\framebox(50,30){aAND}}
+\put(0,190){\framebox(50,30){aBit}}
 
 %aBit->aAND
-\put(25,180){\vector(1,1){15}}
+\put(25,220){\vector(1,1){15}}
 
 %aBit->Online
-\put(25,180){\line(0,1){100}}
-\put(25,280){\line(1,0){345}}
-\put(370,280){\line(0,-1){235}}
-\put(370,45){\vector(-1,0){20}}
-\put(370,185){\vector(-1,0){20}}
+\put(25,220){\line(0,1){140}}
+\put(25,360){\line(1,0){345}}
+\put(370,360){\line(0,-1){275}}
+\put(370,85){\vector(-1,0){57}}
+\put(370,225){\vector(-1,0){57}}
 
 %aAND->Online
-\put(65,225){\line(0,1){40}}
-\put(65,265){\line(1,0){295}}
-\put(360,265){\line(0,-1){210}}
-\put(360,55){\vector(-1,0){10}}
-\put(360,195){\vector(-1,0){10}}
-
-\put(120,220){\framebox(50,30){Mult Triples}}
-\put(170,235){\vector(1,-1){27}}
-\put(120,180){\framebox(50,30){Square Pairs}}
-\put(170,195){\vector(1,0){27}}
-\put(120,140){\framebox(50,30){Bits Pairs}}
-\put(170,155){\vector(1,1){27}}
-
-\put(200,180){\framebox(70,30){Sacrifice/Inputs}}
-\put(270,195){\vector(1,0){30}}
-\put(300,180){\framebox(50,30){Online Two}}
-
-
-\put(120,80){\framebox(50,30){Mult Triples}}
-\put(170,95){\vector(1,-1){27}}
-\put(120,40){\framebox(50,30){Square Pairs}}
-\put(170,55){\vector(1,0){27}}
-\put(120,0){\framebox(50,30){Bits Pairs}}
-\put(170,15){\vector(1,1){27}}
-
-\put(200,40){\framebox(70,30){Sacrifice/Inputs}}
-\put(270,55){\vector(1,0){30}}
-\put(300,40){\framebox(50,30){Online One}}
+\put(65,265){\line(0,1){80}}
+\put(65,345){\line(1,0){295}}
+\put(360,345){\line(0,-1){250}}
+\put(360,95){\vector(-1,0){47}}
+\put(360,235){\vector(-1,0){47}}
+
+\put(120,300){\framebox(50,30){Mult Triples}}
+\put(170,315){\vector(3,-2){92}}
+\put(120,260){\framebox(50,30){Square Pairs}}
+\put(170,275){\vector(3,-1){83}}
+\put(120,220){\framebox(50,30){Bits Pairs}}
+\put(170,235){\vector(1,0){87}}
+\put(120,180){\framebox(70,30){Sacrifice/Inputs}}
+\put(190,185){\vector(2,1){65}}
+
+\put(260,220){\framebox(50,30){Online Two}}
+
+
+\put(120,120){\framebox(50,30){Mult Triples}}
+\put(170,135){\vector(3,-1){84}}
+\put(120,80){\framebox(50,30){Square Pairs}}
+\put(170,95){\vector(1,0){83}}
+\put(120,40){\framebox(50,30){Bits Pairs}}
+\put(170,55){\vector(3,1){88}}
+\put(120,0){\framebox(70,30){Sacrifice/Inputs}}
+\put(190,15){\vector(1,1){67}}
+
+\put(260,80){\framebox(50,30){Online One}}
 
 \end{picture}
 \end{center}
@@ -224,6 +223,7 @@ \subsection{Overview}
 \end{center}
 \caption{Pictorial Representation of Memory and Registers:
 With Two Online Threads}
+\label{fig:memory}
 \end{figure}
 
 \subsection{Byte-code instructions}
@@ -341,6 +341,8 @@ \subsection{Preprocessing loading instructions}
 respectively.
 The associated data is loaded from the concurrently running
 offline threads and loaded into the registers given as arguments.
+There is also an instruction \verb+DABIT+ to load a doubly authenticated
+bit into a \verb|sint| and an \verb|sbit| register.
 
 \subsection{Open instructions}
 There are tailor-made approaches to open registers depending on whether they are $\modp$ or $\modn$. We detail both in this section. 
@@ -502,9 +504,9 @@ \subsection{Debuging Output}
 debugging information to the \verb+Input_Output+ class.
 These byte-codes are
 \begin{verbatim}
-    PRINTINT,           PRINTMEM,          PRINTREG,           PRINTREGPLAIN,
-    PRINTCHR,           PRINTSTR,          PRINTCHRINT,        PRINTSTRINT,
-    PRINTFLOATPLAIN,    PRINTFIXPLAIN.
+    PRINT_INT,        PRINT_MEM,            PRINT_REG,              PRINT_CHAR,         
+    PRINT_CHAR4,      PRINT_CHAR_REGINT,    PRINT_CHAR4_REGINT,     PRINT_FLOAT,  
+    PRINT_FIX.
 \end{verbatim}
 
 \subsection{Data input and output}
@@ -518,14 +520,16 @@ \subsection{Data input and output}
              OPEN_CHAN,              CLOSE_CHAN
 \end{verbatim}
 
-\subsection{Branching on clear registers}
-Branching on clear registers is supported by the following
-instructions
+\subsection{Branching}
+Branching is supported by the following instructions
  \verb+JMP+,
     \verb+JMPNZ+,
     \verb+JMPEQZ+,
-    and
-    \verb+JMPI+.
+
+\subsection{Call/Return}
+Call and return to subroutines is supported by the following
+instructions
+\verb+CALL+ and \verb+RETURN+.
 
 \subsection{Comparison Tests for $\modn$}
 We support comparison on $\modn$ clear registers via the instructions
@@ -563,11 +567,11 @@ \subsection{Other Commands}
 See Section \ref{sec:restart} for more details on how this is used.
 \item \verb+CLEAR_REGISTERS+ which clears the registers of this processor core (i.e. thread).
 See Section \ref{sec:restart} for more details on how this is used.
-\item \verb+START_TIMER+ and \verb+STOP_TIMER+ are used to time different
+\item \verb+START_CLOCK+ and \verb+STOP_CLOCK+ are used to time different
 parts of the code. There are 100 times available in the system;
 each is initialized to zero at the start of the machine running.
-The operation \verb+START_TIMER+ re-initializes a specified timer,
-whereas \verb+STOP_TIMER+ prints the elapsed time since the last
+The operation \verb+START_CLOCK+ re-initializes a specified timer,
+whereas \verb+STOP_CLOCK+ prints the elapsed time since the last
 initialization (it does not actually reinitialise/stop the timer itself).
 These are accessed from MAMBA via the functions
 \verb+start_timer(n)+ and \verb+stop_timer(n)+.
diff --git a/Documentation/Changes.tex b/Documentation/Changes.tex
index 2dd66fd6..8427b77a 100644
--- a/Documentation/Changes.tex
+++ b/Documentation/Changes.tex
@@ -6,7 +6,7 @@
 \vspace{5mm}
 
 \noindent
-This is currently {\bf version 1.5} of the SCALE and MAMBA software.
+This is currently {\bf version 1.6} of the SCALE and MAMBA software.
 There are two main components to the system, a run time system called
 SCALE,
 \begin{center}
@@ -26,6 +26,55 @@
 You {\em should not} assume it works the same so please read
 the documentation fully before proceeding.
 
+
+\subsection{Changes in version 1.6 from 1.5}
+Apart from the usual minor bug fixes...
+\begin{enumerate}
+\item Call/Return byte-codes have been added, removing the need for the \verb+JMPI+
+instruction. This makes things much simpler going forward, we hope.
+\item Some byte-code names have been changed, and made more consistent. This
+is purely cosmetic, unless you look at the assembler output from the compiler.
+\item \verb|sfloat|, \verb|sfix| are now a little more robust. A program will
+now definitely abort if the desired security bounds are not met. 
+We also removed a limitation on the mantissa size in \verb|sfloat|.
+\item Fake offline mode is now deprecated and you {\em cannot} select it
+from the Setup menu. This is due to a bug, and the fact we never actually
+use/test it ourselves.
+\item Re the change in version 1.4 for Full Threshold input production.
+Turns out the method in SPDZ-2 and in Overdrive are both insecure.
+One needs to execute a ZKPoK to prove that party $P_i$'s input is correct;
+which is basically the TopGear proof done without a summation. Then
+one operates as in SPDZ-2. This has now been altered.
+\item Compiler now outputs assembler for all tapes which are compiled, if 
+directed to.
+\item Direct access to the DABITs via a DABIT opcode.
+\item Upgrade to the DABIT production for dishonest majority.
+\item Change to the threading to place sacrificing into the production
+threads. This makes the configuration in \verb+config.h+ a little
+simpler to understand, and also removes a potential security hole
+we discussed in earlier documentation. 
+This means some of the data structures previously defined in 
+the RunTime are no longer needed.
+\item The Discrete Gaussians can now be selected using different
+bounds on the NewHope loop. This bound is defined in \verb|config.h|.
+The previous value was hardwired to 20; we now allow the user
+to compile the system with any value. The default is now one.
+See Section \ref{sec:fhe} for details of this.
+\item We have extended the range of ring dimensions available,
+so bigger parameters can be utilized. Note, this is untested
+in terms of memory usage, so could result in huge memory 
+and/or network problems.
+\item You can also, by editing \verb|config.h|, use discrete Gaussian
+secret keys instead of short Hamming weight keys if you so desire.
+This makes the FHE parameters a little bigger though.
+\item We fixed some bugs in the \verb|sfloat| class in relation to
+some division operations.
+\item We have added a program provided by Mark Will which allows
+people using YoSys synthesis tools to produce circuits in Bristol
+Fashion.
+\end{enumerate}
+
+
 \subsection{Changes in version 1.5 from 1.4.1}
 Apart from the usual minor bug fixes...
 \begin{enumerate}
diff --git a/Documentation/Compiler.tex b/Documentation/Compiler.tex
index a7bdbe28..967bb817 100644
--- a/Documentation/Compiler.tex
+++ b/Documentation/Compiler.tex
@@ -811,7 +811,6 @@ \subsubsection{Operations on Data Types}
 \paragraph{Exponentiation:}
 We also provide a built-in exponentiation operator for exponentiation over integer and 
 fractional $\modp$ data types ($**$), when the base is secret shared. 
-For the fractional case, 
 This overload provides a simple implementation for successive multiplications. 
 We provide more comprehensive protocols for exponentiation of fractional inputs 
 in the following sections. 
@@ -832,6 +831,13 @@ \subsubsection{Operations on Data Types}
 This is not the case for integer types. We invite the reader to revise the Advance Protocols 
 section for a revision of alternatives to compute the exponent on fractional data types.
 
+One can also raise a \verb|sfix| value to another \verb|sfix| value using the 
+function \verb|mpc_math.pow_fx|. However, when doing this the routine uses 
+\verb|sfloat| variables internally, thus you need to ensure the two types
+are parametrized correctly.
+In particular, the \verb|k| parameter for \verb|sfix| must be greater than
+the \verb|vlen| for \verb|sfloat|.
+
 \paragraph{Comparisons (Inequality Tests):}
 We have in-built operators for comparisons as well. 
 Indeed, comparison of secret values is supported, 
@@ -987,10 +993,10 @@ \subsubsection{Printing}
 
 \noindent
 There are other member functions which perform printing as well, these are
-\func{cfix.print_fix_plain()\\
-regint.print_reg_plain()\\
-cfloat.print_float_plain()\\
-cint.print_reg_plain()}
+\func{cfix.print_fix()\\
+regint.print_reg()\\
+cfloat.print_float()\\
+cint.print_reg()}
 
 
 \subsubsection{How to print Vectorized data}
@@ -1028,7 +1034,7 @@ \subsubsection{class sfix}
 	\func{v}
 	\textbf{Accessed by:} \verb|Default|.\\
 	\textbf{Type:} \verb|sint|.\\
-	S	Stores a register of type \verb|sint| on the $\{-2^k-1, 2^k-1\}$ interval, encoding of the rational original value.
+	Stores a register of type \verb|sint| on the $\{-2^k-1, 2^k-1\}$ interval, encoding of the rational original value.
 	\func{f}
 	\textbf{Accessed by:} \verb|Default|.\\
 	\textbf{Type:} \verb|int|.\\
@@ -1078,7 +1084,7 @@ \subsubsection{class sfix}
 a = sfix()
 a.load_int(5)
 b = a*3.0
-print_ln(b.reveal(b))  #the output is 15   	
+print_ln("the answer is %s", b.reveal())  #the output is 15   	
 \end{lstlisting}	 
 \func{sfix.conv()}
 			\textbf{Accessed by:} \verb|Default|.		 \\
@@ -1092,7 +1098,7 @@ \subsubsection{class sfix}
 a =sfix()
 a.load_int(4.5)
 v = a.conv()
-print_ln(v.reveal())  # the output is 4718592   	
+print_ln("the answer is %s", v.reveal())  # the output is 4718592   	
 \end{lstlisting}	 
 %\func{sfix.store_in_mem}
 		
@@ -1110,7 +1116,7 @@ \subsubsection{class sfix}
 a =sfix()
 a.load_int(4.5)
 r = a.sizeof()
-print_ln(r)  # the output is 1. 
+print_ln("the answer is %s", r)  # the output is 1. 
              # By Default the global_vector_size is set to 1.    	
 \end{lstlisting}	 
 \func{sfix.compute_reciprocal()}		
@@ -1126,7 +1132,7 @@ \subsubsection{class sfix}
 a =sfix()
 a.load_int(4.5)
 r = a.compute_reciprocal()
-print_ln(r.reveal())  # the output is 0.222222   	
+print_ln("the answer is %s", r.reveal())  # the output is 0.222222   	
 \end{lstlisting}	 
 	\paragraph{Observations:}
 	\begin{description}
@@ -1203,7 +1209,7 @@ \subsubsection{class cfix}
 a = cfix()
 a.load_int(5)
 b = a*3.0
-print_ln(b)  #the output is 15   	
+print_ln("the answer is %s", b)  #the output is 15   	
 \end{lstlisting}	 
 \func{cfix.conv()}
 			\textbf{Accessed by:} \verb|Default|.		 \\
@@ -1217,7 +1223,7 @@ \subsubsection{class cfix}
 a =cfix()
 a.load_int(4.5)
 v = a.conv()
-print_ln(v)  # the output is 4718592   	
+print_ln("the answer is %s", v)  # the output is 4718592   	
 \end{lstlisting}	 
 %\func{sfix.store_in_mem}
 		
@@ -1235,7 +1241,7 @@ \subsubsection{class cfix}
 a =cfix()
 a.load_int(4.5)
 r = a.sizeof()
-print_ln(r)  # the output is 4. 
+print_ln("the answer is %s", r)  # the output is 4. 
 			 # By Default the global_vector_size is set to 1.    	
 \end{lstlisting}	 
 \func{cfix.compute_reciprocal()}		
@@ -1250,7 +1256,7 @@ \subsubsection{class cfix}
 a =sfix()
 a.load_int(4.5)
 r = a.compute_reciprocal()
-print_ln(r)  # the output is 0.222222   	
+print_ln("the answer is %s", r)  # the output is 0.222222   	
 \end{lstlisting}	 
 	\paragraph{Observations:}
 	\begin{description}
@@ -1414,7 +1420,7 @@ \subsubsection{class sfloat}
 \begin{lstlisting}
 a = sfloat(4.5)
 r = a.sizeof()
-print_ln(r)  # the output is 5. 
+print_ln("the answer is %s", r)  # the output is 5. 
 			 # By Default the global_vector_size is set to 1.
 \end{lstlisting}
                         
@@ -1473,7 +1479,7 @@ \subsubsection{class cfloat}
 \begin{lstlisting}
 a =cfloat(4.5)             
 r = a.sizeof()
-print_ln(r)  # the output is 4. 
+print_ln("the answer is %s", r)  # the output is 4. 
 			 # By Default the global_vector_size is set to 1.
 \end{lstlisting}
 
@@ -1612,7 +1618,7 @@ \subsubsection{Arrays}
 
 @while_do(lambda x: x < 5, 0)
 def while_body(i):
-    print_ln("%s",new_array[i].reveal())
+    print_ln("%s", new_array[i].reveal())
     return i+1
 \end{lstlisting}
 Declares new array of size 10, then fills it with values from 1 to 10, at the end prints first five values, so it prints numbers from 1 to 5. Note that the values of array can be modified inside the function, they are exacly like MemValue. \\
diff --git a/Documentation/Documentation.tex b/Documentation/Documentation.tex
index 48b9845f..5bf2d7d0 100644
--- a/Documentation/Documentation.tex
+++ b/Documentation/Documentation.tex
@@ -3,6 +3,7 @@
 
 \usepackage[nottoc]{tocbibind}
 
+\usepackage{placeins}
 \usepackage{verbatim}
 \usepackage{fullpage}
 \usepackage{times}
@@ -32,7 +33,7 @@
 \newcommand{\msubsection}[1]{\newpage \subsection{#1}}
 \newcommand{\msubsubsection}[1]{\subsubsection{#1}}
 
-\title{SCALE--MAMBA v1.5 : Documentation}
+\title{SCALE--MAMBA v1.6 : Documentation}
 \author{
 A. Aly
 \and D. Cozzo
diff --git a/Documentation/FHE.tex b/Documentation/FHE.tex
index ee9d237d..cdb5df88 100644
--- a/Documentation/FHE.tex
+++ b/Documentation/FHE.tex
@@ -44,25 +44,27 @@
 In fact this is an under-estimate of the probability.
 From $\epsilon$ we define $e_i$ such that 
 $\mathsf{erfc}(e_i)^i \approx 2^{-\epsilon}$ and then we set $\gc_i = e_i^i$.
+\item \verb+NewHopeB+ (default $1$):
+This defines how Gaussians are selected in the FHE system for
+Full Threshold. We use the NewHope approximation of
+$\sum b_i - b_i'$, where $b_i, b_i' \in \{0,1\}$,
+with the sum being over \verb+NewHopeB+ values of $i$.
+This gives an approximation to a discrete Gaussian with
+standard deviation $\sigma = \sqrt{\texttt{NewHopeB}/2}$.
+\item \verb+HwtSK+ (default $64$):
+The Hamming weight of the secret key. If this is a negative
+number then the discrete Gaussian is used for the secret key.
 \end{itemize}
 All of these parameters can be tweaked in the file
 \verb+config.h+.
-Another two parameters related to the FHE scheme are
-\verb+hwt+, which is defined in \verb+Setup.cpp+ to
-be equal to $h=64$.
-This defines the number of non-zero coefficients in the
-FHE secret key.
-Another parameter is $\sigma$ which is the standard deviation
-for our approximate discrete Gaussians, which we
-hardwire to be $3.16 = \sqrt{10}$ (as we use the NewHope method
-of producing approximate Gaussians).
 
 \subsection{Main Security Parameters}
 Our Ring-LWE samples are (essentially) always from an
-approximate Gaussian distribution with standard deviation $3.16$
+approximate Gaussian distribution with standard deviation 
+$\sigma=\sqrt{\texttt{NewHopeB}/2}$
 and from a ring with a two-power degree of $N$.
 This is not strictly true as some noise samples come
-from small Hamming Weight distributions, and some come
+from (possibly) small Hamming Weight distributions, and some come
 from distributions over $\{-1,0,1\}$. 
 But the above are the main parameters, and have been used in prior works to
 estimate Ring-LWE security in SHE settings.
@@ -80,32 +82,82 @@ \subsection{Main Security Parameters}
 This is done by running the following \verb+sage+ code
 \begin{verbatim}
 load("estimator.py")
-n = 1024
-comp_sec = 128
-for i in range(10, 500, 5):
-   q= 2^i
-   costs = estimate_lwe(n, 3.16*sqrt(2*pi)/q, q, reduction_cost_model=BKZ.sieve,  \
-                        skip=["arora-gb", "bkw", "dec", "mitm"])
-   if any([cost.values()[0]<2^comp_sec for cost in costs.values()]):
-       break
-print i-5
+import sys
+
+for n in range(10,17):
+  N = 2^n
+  #for B in [1,2,4,8,16,20]:
+  for B in [1]:
+    sigma=sqrt(B/2.0)
+    ans=[1,1,1]
+    cnt=0
+    for sec in [80,128,256]:
+      bot=0
+      top=40
+      if N>5000:
+	top=100
+      if N>20000:
+	top=256
+      if N>40000:
+	top=900
+      repeat=true
+      while repeat:
+        repeat=false
+        q=2^top
+        costs = estimate_lwe(N, RealField(512)(sigma*sqrt(2*pi)/q), q, \
+                             reduction_cost_model=BKZ.sieve, skip=["arora-gb", \
+                             "bkw", "dec", "mitm"])
+        if not any([cost.values()[0]<2^sec for cost in costs.values()]):
+	     bot=top
+             top=2*top
+	     repeat=true
+      while top <> bot:
+         mid=round((top+bot)/2-.5)
+         if (mid==bot):
+		break
+         sys.stdout.flush()
+         q = 2^mid
+         costs = estimate_lwe(N, RealField(512)(sigma*sqrt(2*pi)/q), q, \
+                              reduction_cost_model=BKZ.sieve, skip=["arora-gb", \
+                              "bkw", "dec", "mitm"])
+         if any([cost.values()[0]<2^sec for cost in costs.values()]):
+	     top=mid
+         else:
+	     bot=mid
+      sys.stdout.flush()
+      ans[cnt]=bot
+      cnt=cnt+1
+    print N,"&",B,"&",sigma,"&",ans[0],"&",ans[1],"&",ans[2],"\\\\"
+    sys.stdout.flush()
 \end{verbatim}
-This will print a bunch of rubbish and then the number $29$. 
-This means any $q < 2^{29}$ will be ``secure'' by the above definition of secure.
+When run via 
+\begin{center}
+  \verb+sage < SCALE-Est.py > Res.txt+
+\end{center}
+this will produce lines of the form
+\begin{center}
+\verb+1024 & 1 & 0.707106781186548 & 40 & 25 & 12 \\+
+\end{center}
+This means that for ring dimension $1024$, with \verb+NewHopeB+ equal to
+one (and so $\sigma=0.707$), at the 80-bit security level any $q < 2^{40}$ will 
+be ``secure'' by the above definition of secure.
+Note that producing the table for \verb+NewHopeB+ equal to one produces
+values which remain secure when a higher value of \verb+NewHopeB+ is selected.
 
-We did this in Feb 2018 and obtained the following table of values, giving maximum
+We did this in Oct 2019 and obtained the following table of values, giving maximum
 values of $q$ in the form of $2^x$ for the value $x$ from the following table.
 \begin{center}
-\begin{tabular}{|c|c|c|c|}
+\begin{tabular}{|c|c|c|c||c|c|}
 \hline
-$N$   & \verb+comp_sec+=80 & \verb+comp_sec+=128 & \verb+comp_sec+=256 \\
+$N$   & \verb+NewHopeB+ & $\sigma$ &  \verb+comp_sec+=80 & \verb+comp_sec+=128 & \verb+comp_sec+=256 \\
 \hline
-1024  &  44 &    29        & 16 \\
-2048  &  86 &    56        & 31 \\
-4096  & 171 &   111        & 60 \\
-8192  & 344 &   220        & 120 \\
-16384 & 690 &   440        & 239 \\
-32768 & 998 &   883        & 478 \\
+1024 & 1 & 0.707106781186548 & 40 & 25 & 12 \\
+2048 & 1 & 0.707106781186548 & 82 & 52 & 26 \\
+4096 & 1 & 0.707106781186548 & 167 & 106 & 56 \\
+8192 & 1 & 0.707106781186548 & 340 & 215 & 115 \\
+16384 & 1 & 0.707106781186548 & 686 & 436 & 235 \\
+32768 & 1 & 0.707106781186548 & 1392 & 879 & 473 \\
+65536 & 1 & 0.707106781186548 & 2830 & 1778 & 953 \\
 \hline
 \end{tabular}
 \end{center}
@@ -158,8 +210,8 @@ \subsection{Distributions and Norms}
       probability of coefficient is $p_{-1}=1/4$, $p_0=1/2$
       and $p_1=1/4$.
 \item $\dN(\sigma^2,N)$: This generates a vector of
-      length $N$ with elements chosen according to an approximation to
-      the discrete Gaussian distribution with variance $\sigma^2$.
+      length $N$ with elements chosen according to the NewHope 
+      approximation to the discrete Gaussian distribution with variance $\sigma^2$.
 \item $\RC(0.5,\sigma^2,N)$: This generates a triple of
       elements $(v,e_0,e_1)$ where $v$ is sampled from $\ZO_s(0.5,N)$
       and $e_0$ and $e_1$ are sampled from $\dN_s(\sigma^2,N)$.
@@ -173,6 +225,8 @@ \subsection{Distributions and Norms}
 when sampled from $\dN(\sigma^2,\phi(m))$ we obtain variance
 $\sigma^2 \cdot \phi(m)$ and when sampled from $\calU(q,\phi(m))$
 we obtain variance $q^2 \cdot \phi(m)/12$.
+In what follows we let $V_\sk$ denote $\sqrt{\texttt{HwtSK}}$ in the case 
+when \verb|HwtSK| is positive, and $\sigma \cdot \sqrt{\phi(m)}$ otherwise.
 By the law of large numbers we can use $\gc_1 \cdot \sqrt{V}$,
 where $V$ is the above variance, as a high probability bound
 on the size of $a(\zeta_m)$ (the probability is $1-2^{-\epsilon}$, 
@@ -181,12 +235,11 @@ \subsection{Distributions and Norms}
 If we take a product of $t$ such
 elements with variances $V_1, V_2, \ldots, V_t$
 then we use $\gc_t \cdot \sqrt{V_1 \cdot V_2 \cdots V_t}$
-as the resulting bounds.
+as the resulting high probability bounds.
 In our implementation we approximate $\dN(\sigma^2,n)$ using
-the binomial method from the NewHope paper, with a standard
-deviation of $3.16 = \sqrt{10}$. In particular this means
+the above binomial method from the NewHope paper; this means
 any vector sampled from $\dN(\sigma^2,n)$ will have
-infinity norm bounded by $20$.
+infinity norm bounded by \verb+NewHopeB+.
 
 
 \subsection{The FHE Scheme and Noise Analysis}
@@ -219,9 +272,10 @@ \subsubsection{Key Generation:}
 key generation protocol for the underlying threshold FHE keys was
 assumed. In SCALE we assume a `magic black box' which distributes
 these keys, which we leave to the application developer to create.
-The secret key $\sk$ is selected from a distribution with
-Hamming weight $h$, i.e. $\HWT(h,\phi(m))$, 
-and then it is distributed amongst the $n$ parties by simply producing a random 
+The secret key $\sk$ is either selected from a distribution with
+Hamming weight $h$, i.e. $\HWT(h,\phi(m))$, or from
+$\dN(\sigma^2,N)$ (depending on what was selected in \verb|config.h|).
+Then the secret key is distributed amongst the $n$ parties by simply producing a random 
 linear combination, and assigning each party one of the sums.
 The switching key data is produced in the standard way, i.e.
 in a non-distributed trusted manner.
@@ -239,7 +293,7 @@ \subsubsection{Key Generation:}
 \[ a_{\sk,\sk^2} \asn \calU(q,\phi(m)) \quad \mbox{ and } \quad 
    b_{\sk,\sk^2} =   a_{\sk,\sk^2} \cdot \sk + p \cdot e_{\sk,\sk^2} - p_1 \cdot \sk^2 \]
 where $e_{\sk,\sk^2} \asn \dN(\sigma^2,\phi(m))$.
-We take $\sigma=3.16$ as described above in what follows.
+
 
 \subsubsection{Encryption:}
 To encrypt an element $m\in R$, we choose $v, e_0, e_1 \asn \RC(0.5,\sigma^2,n)$, i.e.
@@ -248,7 +302,7 @@ \subsubsection{Encryption:}
 \]
 Then we set $c_0 = b \cdot v + p \cdot e_0+m$,~ $c_1=a\cdot v+p\cdot
 e_1$, and set the initial ciphertext as $\ct'=(c_0,c_1)$.
-We calculate a bound (with high probability) on the output noise of
+We calculate a bound (which holds with high probability) on the output noise of
 an honestly generated ciphertext to be
 \begin{align*}
   \norm{c_0 - \sk \cdot c_1}_\infty^\can 
@@ -264,7 +318,7 @@ \subsubsection{Encryption:}
 	  + p \cdot \sigma \cdot 
              \left( \gc_2 \cdot \phi(m) / \sqrt{2}
 			  + \gc_1 \cdot \sqrt{\phi(m)}
-			  + \gc_2 \cdot \sqrt{h \cdot \phi(m)}
+			  + \gc_2 \cdot \sqrt{\phi(m)} \cdot V_\sk
 		    \right) = B_\clean.
 \end{align*}
 Note this is a probablistic bound and not an absolute bound.
@@ -272,26 +326,29 @@ \subsubsection{Encryption:}
 \vspace{5mm}
 
 \noindent
-However, below we will only be able to guarantee the
-$m, v, e_0$ and $e_1$ values of a {\bf sum} of $n$ fresh ciphertexts
+However, below (using the TopGear protocol)
+we will only be able to guarantee the $m, v, e_0$ and $e_1$ 
+values of a {\bf sum} of $n$ fresh ciphertexts
 (one from each party) are selected subject to
-\[ \norm{v}_\infty \le 2^{\ZKsecp + 3} \cdot n \quad 
+\[ \norm{2 \cdot v}_\infty \le 2^{\ZKsecp + 3} \cdot n \quad 
     \mbox{  and  } \quad
-   \norm{e_0}_\infty, \norm{e_1}_\infty \le 20 \cdot 2^{\ZKsecp + 2} \cdot n \quad
+   \norm{2 \cdot e_0}_\infty, \norm{2 \cdot e_1}_\infty \le \texttt{NewHopeB} \cdot 2^{\ZKsecp + 2} \cdot n \quad
     \mbox{  and  } \quad
-   \norm{m}_\infty \le 2^{\ZKsecp + 1} \cdot n \cdot p,
+   \norm{2 \cdot m}_\infty \le 2^{\ZKsecp + 1} \cdot n \cdot p,
 \]
 where $\ZKsecp$ is our soundness parameter for the 
-zero-knowledge proofs
-and $U$ is selected so that $(2 \cdot \phi(m))^U > 2^\Soundsecp$.
-In this situation we obtain the bound, using the inequality above
-between the infinity norm in the polynomial embedding
-and the infinity norm in the canonical embedding,
+zero-knowledge proofs and $U$ is selected so that 
+$(2 \cdot \phi(m))^U > 2^\Soundsecp$.
+Thus in TopGear we double the generated ciphertext 
+(which is the sum of the input players' ciphertexts)
+to obtain a ciphertext $(c_0,c_1)$ which even in the
+case of dishonest players has a noise bound in the
+infinity norm in the canonical embedding of,
 \begin{align*}
   \norm{c_0 - \sk \cdot c_1}_\infty^\can
   &\le \sum_{i=1}^n
   	\norm{2 \cdot m_i}_\infty^\can
-	+ p \cdot \Big(\norm{\epsilon}_\infty^\can \cdot \norm{2 \cdot e_{2,i}}_\infty^\can 
+	+ p \cdot \Big(\norm{\epsilon}_\infty^\can \cdot \norm{2 \cdot v_i}_\infty^\can 
 	+ \norm{2 \cdot e_{0,i}}_\infty^\can \\
   &\hspace{3.5cm}
 	+ \norm{\sk}_\infty^\can \cdot \norm{2 \cdot e_{1,i}}_\infty^\can \Big) \\
@@ -300,18 +357,18 @@ \subsubsection{Encryption:}
 	+ p \cdot \Big(
 	 \gc_1 \cdot \sigma \cdot \phi(m)^{3/2} \cdot 2 \cdot 2^{\ZKsecp+2} \cdot n\\
   &\hspace{3cm}
-	  + \phi(m) \cdot 2 \cdot  2^{\ZKsecp+2} \cdot n \cdot 20  \\
+	  + \phi(m) \cdot 2 \cdot  2^{\ZKsecp+2} \cdot n \cdot \texttt{NewHopeB}  \\
   &\hspace{3cm}
-	  +  \gc_1 \cdot \sqrt{h} \cdot \phi(m) \cdot 2 \cdot 2^{\ZKsecp+2} \cdot n \cdot 20 
+	  +  \gc_1 \cdot V_\sk \cdot \phi(m) \cdot 2 \cdot 2^{\ZKsecp+2} \cdot n \cdot \texttt{NewHopeB} 
 	\Big)  \\
   &=\phi(m) \cdot 2^{\ZKsecp+2} \cdot n \cdot p
 	\cdot \Big( \frac{41}{2} + \gc_1 \cdot \sigma \cdot \phi(m)^{1/2}  
-	           +  20 \cdot \gc_1 \cdot \sqrt{h} 
+	           +  \texttt{NewHopeB} \cdot \gc_1 \cdot V_\sk
 	\Big)   \\
   & = B_\clean^\dishonest.
 \end{align*}
 Again this is a probabilistic bound (assuming validly
-distributed key generation), but assumes the worst case
+distributed key generation), but assumes the {\em worst case}
 ciphertext bounds.
 
 \subsubsection{$\SwitchModulus((c_0,c_1))$:}
@@ -331,7 +388,7 @@ \subsubsection{$\SwitchModulus((c_0,c_1))$:}
 \begin{align*}
     B_\scale 
 	&= c_1 \cdot p \cdot \sqrt{ \phi(m)/12}
-	+  c_2 \cdot p \cdot \sqrt{ \phi(m) \cdot h/12}, \\
+	+  c_2 \cdot p \cdot \sqrt{ \phi(m)/12} \cdot V_\sk, \\
 \end{align*}
 This is again a probabilistic analysis, assuming validly generated
 public keys.
@@ -368,7 +425,7 @@ \subsubsection{$\DistDec_{\{\sk_i\}}(\ct)$:}
 \item Player one computes $\vv_1 = e_{\vm+\vf}^{(0)}-\sk_1 \cdot e_{\vm+\vf}^{(1)}$ and player $i \ne 1$ computes.
 $\vv_i = -\sk_i \cdot e_{\vm+\vf}^{(1)}$.
 \item All players compute $\vt_i = \vv_i + p \cdot \vr_i$ for some random element
-with infinity norm given by $2^\DDsecp \cdot B/p$, where $\DDsecp$ is the parameter defining statistical distance for the distributed decryption.
+with infinity norm given by $2^\DDsecp \cdot B_\dec/p$, where $\DDsecp$ is the parameter defining statistical distance for the distributed decryption.
 \item The parties broadcast $\vt_i$.
 \item The parties compute $\vm+\vf = \sum \vt_i \pmod{p}$.
 \end{enumerate}
@@ -383,15 +440,15 @@ \subsubsection{$\DistDec_{\{\sk_i\}}(\ct)$:}
 
 \paragraph{Reshare Version 1:}
 This is described in \figref{reshare}.
-The value $B$ in the protocol is an upper bound on the noise in the canonical embedding
+The value $B_\dec$ in the protocol is an upper bound on the noise in the canonical embedding
 $\nu$ associated with a ciphertext we will decrypt in our protocols.
 To ensure valid distributed decryption we require
-\[ 2 \cdot (1+n \cdot 2^\DDsecp) \cdot B < q_{0}. \]
+\[ 2 \cdot (1+n \cdot 2^\DDsecp) \cdot B_\dec < q_{0}. \]
 
-Given a value of $B$, we therefore will obtain a lower bound
+Given a value of $B_\dec$, we therefore will obtain a lower bound
 on $p_0$ by the above inequality.
 The addition of a random term with infinity norm bounded by
-$2^\DDsecp \cdot B/p$ in the distributed decryption procedure
+$2^\DDsecp \cdot B_\dec/p$ in the distributed decryption procedure
 ensures that the individual {\em coefficients} of the sum
 $\vt_1+\cdots+\vt_n$ are statistically indistinguishable from
 random, with probability $2^{-\DDsecp}$.
@@ -414,9 +471,9 @@ \subsubsection{$\DistDec_{\{\sk_i\}}(\ct)$:}
 
 \begin{Boxfig}{Distributed decryption to secret
     sharing}{distdec}{$\mathsf{Reshare-2}$}
-Furthermore, let $B$ denote a bound on the noise, that is $\norm{c_0 - s \cdot c_1}_\infty$.
+Furthermore, let $B_\dec$ denote a bound on the noise, that is $\norm{c_0 - s \cdot c_1}_\infty$.
 \begin{enumerate}
-\item Party $i$ samples $\vf_i \asn [0,B\cdot 2^\DDsecp]^N$.
+\item Party $i$ samples $\vf_i \asn [0,B_\dec \cdot 2^\DDsecp]^N$.
 \item Party $1$ computes
   $\vv_i := (c_0 - s_1 \cdot c_1) - \vf_1 \bmod q$, and every other
   party $i$ computes $\vv_i :=  -s_i \cdot c_1- \vf_i$.
@@ -444,12 +501,12 @@ \subsubsection{$\SwitchKey(d_0,d_1,d_2)$:}
 So we expect
 \begin{align*}
  \norm{ p \cdot d_2 \cdot e_{\sk,\sk^2}}_\infty^\can 
-     &\le p \cdot c_2 \cdot \sqrt{q_0^2/12 \cdot \sigma^2 \cdot \phi(m)^2} \\
-     & = p \cdot c_2 \cdot q_0 \cdot \sigma \cdot \phi(m)/\sqrt{12} \\
+     &\le p \cdot \gc_2 \cdot \sqrt{q_0^2/12 \cdot \sigma^2 \cdot \phi(m)^2} \\
+     & = p \cdot \gc_2 \cdot q_0 \cdot \sigma \cdot \phi(m)/\sqrt{12} \\
      & = B_\KS \cdot q_0.
 \end{align*}
 Thus
-\[ B_\KS = p \cdot c_2 \cdot \sigma \cdot \phi(m)/\sqrt{12}. \]
+\[ B_\KS = p \cdot \gc_2 \cdot \sigma \cdot \phi(m)/\sqrt{12}. \]
 Then if the input to $\SwitchKey$ has noise bounded by $\nu$ then the output 
 noise value in the canonical embedding will be bounded by
 \[ \nu+\frac{B_\KS \cdot p_0}{p_1} + B_\scale. \]
@@ -484,7 +541,7 @@ \subsubsection{Application to the Offline Phase:}
 \[ U_2 = U_1 +  \frac{B_\clean^\dishonest}{p_1} + B_\scale. \]
 To ensure valid (distributed) decryption, we require
 \[ 2 \cdot U_2 \cdot (1+n \cdot 2^\DDsecp) < p_0, \]
-i.e. we take $B=U_2$ in our distributed decryption protocol.
+i.e. we take $B_\dec=U_2$ in our distributed decryption protocol.
 Note, that here we take the worst case bound for the ciphertext
 noise, but probabilistic analysis everywhere else. Since 
 the key generation is assumed to be honestly performed.
diff --git a/Documentation/FigZKPoK.tex b/Documentation/FigZKPoK.tex
index c7a5d546..c3c22434 100644
--- a/Documentation/FigZKPoK.tex
+++ b/Documentation/FigZKPoK.tex
@@ -1,6 +1,7 @@
 \begin{Boxfig}{Protocol for global proof of knowledge of a set of ciphertexts}{PZK1}{Protocol $\Pi_{\gZKPoK}$}
 The protocol is parametrized by integer parameters $U,V$ and 
 $\flag  \in \left\{ \Diag, \perp \right\}$ as well as $\pk$ and further parameters of the encryption scheme.
+Define $\rho_1=1$ and $\rho_2, \rho_3 = \texttt{NewHopeB}$.
 
 \vspace{3mm}
 
diff --git a/Documentation/Installation.tex b/Documentation/Installation.tex
index 6ab1eee7..e47ed1f8 100644
--- a/Documentation/Installation.tex
+++ b/Documentation/Installation.tex
@@ -10,6 +10,7 @@ \subsection{Installation}
 \item CPU supporting AES-NI and PCLMUL
 \item OpenSSL: Tested with version 1.1.0.b
 \item Crypto++: Tested with version 7.0
+\item CMake, only if you want to use it instead of make: Version 3.1 to 3.14 is required
 \end{itemize}
 Developers will also require
 \begin{itemize}
@@ -103,7 +104,7 @@ \subsubsection{Change config.h}
      config.h
 \end{verbatim}
 in the sub-directory \verb+src+.
-The main things to watch out for here are the various security parameters;
+The main things to watch out for here are the various FHE security parameters;
 these are explained in more detail in Section \ref{sec:fhe}.
 Note, to configure the statistical security parameter for the number representations
 in the compiler (integer comparison, fixed point etc) from the default
@@ -125,6 +126,37 @@ \subsubsection{Final Compilation}
 That's it! After make finishes then you should see a `Player.x`
 executable inside the SCALE-MAMBA directory.
 
+\subsubsection{Compile with CMake (experimental)}
+It is possible to build SCALE with CMake.
+We introduced CMake because there are many development tools that work with CMake-based projects,
+e.g., CLion, clangd and so on.
+
+You may install the dependencies the same way as above.
+We explain how to use CMake with the example below.
+
+\begin{verbatim}
+    mkdir src/build
+    cd src/build
+    # create the cmake project
+    CC=gcc CXX=g++ cmake \
+        -DOPENSSL_ROOT_DIR=$openssl_root \
+        -DCRYPTOPP_ROOT_DIR=$cryptopp_root \
+        -DMPIR_ROOT_DIR=$mpir_root ..
+    # build the project
+    make
+\end{verbatim}
+
+The first step is to have CMake create a project using the \verb+cmake+ command.
+The compiler can be changed using \verb+CC+ and \verb+CXX+.
+Use the \verb+-D+ flag to configure the dependencies.
+For example, if the MPIR library is in \verb+$HOME/mpir/lib+ and its include files are in \verb+$HOME/mpir/include+,
+then \verb+-DMPIR_ROOT_DIR+ should be set to \verb+$HOME/mpir+, i.e., the parent directory of \verb+lib+ and \verb+include+.
+If the dependencies are installed in the default directories,
+e.g., \verb+/usr/{lib64,include}+ or \verb+/usr/local/{lib64,include}+,
+then the \verb+-D+ flags are not needed.
+Finally, run \verb+make+ to create the binaries.
+The \verb+cmake+ command is not needed in subsequent compilations.
+For more information on CMake, we recommend this excellent wiki\footnote{\url{https://gitlab.kitware.com/cmake/community/wikis/home}}.
 
 
 \subsection{Creating and Installing Certificates}
@@ -237,8 +269,10 @@ \subsubsection{Data for networking}
   \item Which IP address is going to be used
   \item The name of the certificate for that player
 \end{itemize}
+\iffalse XXXX
 \item Whether a fake offline phase is going to be used.
 \item Whether a fake sacrifice phase is going to be used.
+\fi
 \end{itemize}
 
 \subsubsection{Data for secret sharing:}
@@ -269,8 +303,10 @@ \subsubsection{Data for secret sharing:}
 research system. 
 At this stage we also generate a set of keys
 for distributed decryption of a level-one FHE scheme if needed.
+\iffalse XXXX
 For the case of fake offline we assume these keys are on {\em each} computer,
 but using fake offline is only for test purposes in any case.
+\fi
 
 \paragraph{Shamir Secret Sharing:}
 Shamir secret sharing we assume is self-explanatory.
diff --git a/Documentation/Introduction.tex b/Documentation/Introduction.tex
index fc2370d6..4d2a86d2 100644
--- a/Documentation/Introduction.tex
+++ b/Documentation/Introduction.tex
@@ -72,28 +72,6 @@
 In any real system this entire setup phase will need
 investigating, with perhaps using HSMs to construct and deploy
 keys if no actively secure key generation protocol is available.
-\item
-There is a security hole in how we have implemented things.
-As the
-\begin{center}
-  \verb+offline->sacrifice->online+ 
-\end{center}
-pipeline is run continously in separate threads, each with their own channels etc., we 
-may {\bf use} a data item in the online phase {\bf before} the checking
-of a data item has {\bf fully completed} (i.e. before in an associated
-sacrifice etc. has been MAC-checked, for full-threshold,
-or hash-checked, for other LSSS schemes). 
-In a real system you will want to address this by having some
-other list of stuff (between sacrifice and online), which
-ensures that all checks are complete before online is
-allowed to use any data; or by ensuring MAC/hash-checking is
-done before the sacrifice queues are passed to the online phase.
-In our current system if something bad is introduced by an adversary
-then the system {\bf will} halt.
-But, before doing so there is a {\em small} chance that some data will 
-have leaked from the computation if the adversary can schedule the 
-error at exactly the right point (depending on what is happening in
-other threads).
 \end{itemize}
 
 
@@ -108,18 +86,13 @@ \subsection{Architecture}
 Using multiple threads enables you to get high throughput. Almost all
 of our experimental results are produced using multiple threads.
 \item Each online is associated with another four ``feeder'' threads.
-One produces multiplication triples, one produces square pairs
-and one produces shared bits.
-The fourth thread performs the sacrificing step, as well as
-the preprocessing data for player IO.
-The chain of events is that the multiplication thread produces
-an unchecked triple. This triple is added to a triple-list (which is
-done in batches for efficiency).
-At some point the sacrifice thread decides to take some
-data off the triple-list and perform the triple checking via
-sacrificing.
-Once a triple passes sacrificing it is passed onto another
-list, called the sacrificed-list, for consumption by the online phase.
+One produces multiplication triples, one produces square pairs,
+one produces shared bits and one produces data for input/output
+of data items.
+The chain of events is that the multiplication thread (say) produces
+a fully checked triple. This triple is added to a triple-list (which is
+done in batches for efficiency) for consumption by the online
+phase.
 The sizes of these lists can be controlled (somewhat) by the
 values in the file \verb+config.h+.
 One can control the number of entries {\em ever}
diff --git a/Documentation/symbols.tex b/Documentation/symbols.tex
index 9044017e..ce4ba073 100644
--- a/Documentation/symbols.tex
+++ b/Documentation/symbols.tex
@@ -203,6 +203,7 @@
 
 \newcommand{\clean}{\mathsf{clean}}
 \newcommand{\scale}{\mathsf{scale}}
+\newcommand{\dec}{\mathsf{dec}}
 \newcommand{\dishonest}{\mathsf{dishonest}}
 \newcommand{\KS}{\mathsf{KS}}
 
diff --git a/Programs/restart_1/restart.mpc b/Programs/restart_1/restart.mpc
index 7921c6bf..712ea517 100644
--- a/Programs/restart_1/restart.mpc
+++ b/Programs/restart_1/restart.mpc
@@ -1,16 +1,17 @@
-sfloat.vlen = 8   # Length of mantissa in bits
-sfloat.plen = 5   # Length of exponent in bits
-sfloat.kappa = 4  # Statistical security parameter for floats
 
-a = [sfloat(i) for i in [-.03,2]]
-a[0], a[1] = cond_swap(a[0], a[1])
+# Write some data to file
+inp = [sint(1), sint(2), sint(3), sint(4)]
+output_shares(3000,*inp)
 
-a = [sfloat(i) for i in [3,0]]
-a[0], a[1] = cond_swap(a[0], a[1])
-
-import random
-random.seed(0)
-a = [sfloat(random.uniform(-100,100)) for i in range(8)]
-odd_even_merge_sort(a)
+print_ln("Player zero enter a number")
+a=sint.get_private_input_from(0)
+print_ln("Player one enter a number")
+b=sint.get_private_input_from(1)
+c=a+b
+print_ln("The SUM is being sent to player one")
+c.reveal_to(1)
+print_reg(reveal(a))
+print_reg(reveal(b))
+print_reg(reveal(c))
 
 restart()
diff --git a/Programs/test_dabit/test_dabit.mpc b/Programs/test_dabit/test_dabit.mpc
new file mode 100644
index 00000000..fc7d3699
--- /dev/null
+++ b/Programs/test_dabit/test_dabit.mpc
@@ -0,0 +1,29 @@
+n_parallel = 8192
+n_threads = 8
+
+n = 8192 * (2**12)
+
+def generate_bits_single(n):
+    for i in range(n / n_parallel):
+        bp = sint(size=n_parallel)
+        b2 = sbit(size=n_parallel)
+        vdabit(n_parallel, *(bp, b2))
+
+def generate_bits_multithread(n):
+    def f():
+        @for_range(n / n_parallel / n_threads)
+        def g(_):
+            bp = sint(size=n_parallel)
+            b2 = sbit(size=n_parallel)
+            vdabit(n_parallel, *(bp, b2))
+
+    t = MPCThread(f, 'f')
+
+    for i in range(n_threads):
+        t.start()
+    for i in range(n_threads):
+        t.join()
+
+start_timer(1)
+generate_bits_multithread(n)
+stop_timer(1)
diff --git a/README.md b/README.md
index 10105db9..2dbb0176 100644
--- a/README.md
+++ b/README.md
@@ -7,26 +7,28 @@
 
 
 First type
-<p>
-        make doc
-</p>
+```
+make doc
+```
+
 Then *read* the documentation!
 
 Note: For Leuven maintainers, if wishing to recompile the basic 64 bit 
 circuits then call
-<p>
-        make circuits
-</p>
+```
+make circuits
+```
+
 These are then compiled down from the netlist down to the Bristol
 fashion again, and then simplified. 
 
 After doing this run
-<p>
-        ./Test-Convert.x
-</p>
+```
+./Test-Convert.x
+```
 to check all is OK.
 
 
-If you want to recompile the .net circuits from the .vhd see the
+If you want to recompile the `.net` circuits from the `.vhd` see the
 instructions in Circuits/README.txt
 
diff --git a/Scripts/compile-scasm.sh b/Scripts/compile-scasm.sh
new file mode 100755
index 00000000..2e98bb6b
--- /dev/null
+++ b/Scripts/compile-scasm.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# This file creates the test assembly files for scasm
+
+# Portably find the directory containing this script.
+HERE=$(cd "$(dirname "$0")"; pwd)
+# Assuming it lives in Scripts/, set ROOT to the root directory of the
+# project.
+ROOT=$HERE/..
+# Now set the PYTHONPATH so we don't have to worry about the CWD
+# whenever we invoke python scripts.
+export PYTHONPATH=$ROOT:$PYTHONPATH
+
+shift $((OPTIND-1))
+
+# Compile Programs/<name> for scasm; $1 is the program name and any
+# further arguments are passed on to compile.py ($compile_opts is left
+# unquoted on purpose so it splits into separate options).
+# The whole script exits with status 1 on the first failed compile.
+run_test() {
+    test=$1
+    shift
+    printf "\n\n\n\n\n\n\n\n\n\n"
+    echo "$test"
+    "$ROOT"/compile.py -a scasm -n -r -M -u $compile_opts "$@" "Programs/$test" || exit 1
+}
+
+test_opts="-s"
+compile_opts="--stop"
+for test in test_all test_array test_branch test_branching test_comparison test_count test_empty_tape test_flex test_float test_floatingpoint test_float_sorting test_float_vector test_for_range_multithread test_function test_idle_threads test_lib test_loop test_map_reduce test_mem_order test_new_threads test_sregint test_threads test_vector test_sfix test_sqrt test_math test_custom_array test_fix_array do_nothing GC_test IO_demo Local_test mem_clear_demo mult_demo restart sregint_tests tutorial test_dabit; do
+    run_test $test
+done
diff --git a/Scripts/test-n-10.sh b/Scripts/test-n-10.sh
index 2e7c05f4..1ba06dec 100755
--- a/Scripts/test-n-10.sh
+++ b/Scripts/test-n-10.sh
@@ -41,7 +41,7 @@ if test "$1"; then
 else
     test_opts="-s"
     compile_opts="--stop"
-    for test in test_all test_array test_branch test_branching test_comparison test_empty_tape test_flex test_float test_floatingpoint test_float_sorting test_float_vector test_function test_idle_threads test_lib test_loop test_mem_order test_sregint test_threads test_vector test_sfix test_sqrt test_math test_custom_array; do
+    for test in test_array test_branch test_branching test_comparison test_empty_tape test_flex test_float test_floatingpoint test_float_sorting test_float_vector test_function test_idle_threads test_lib test_loop test_mem_order test_sregint test_vector test_sfix test_sqrt test_math test_custom_array test_all; do
 	run_test $test
     done
 fi
diff --git a/Scripts/test.sh b/Scripts/test.sh
index dd401867..5841b7c1 100755
--- a/Scripts/test.sh
+++ b/Scripts/test.sh
@@ -38,7 +38,7 @@ if test "$1"; then
 else
     test_opts="-s"
     compile_opts="--stop"
-    for test in test_all test_array test_branch test_branching test_comparison test_count test_empty_tape test_flex test_float test_floatingpoint test_float_sorting test_float_vector test_for_range_multithread test_function test_idle_threads test_lib test_loop test_map_reduce test_mem_order test_new_threads test_sregint test_threads test_vector test_sfix test_sqrt test_math test_custom_array test_fix_array; do
+    for test in test_array test_branch test_branching test_comparison test_count test_empty_tape test_flex test_float test_floatingpoint test_float_sorting test_float_vector test_for_range_multithread test_function test_idle_threads test_lib test_loop test_map_reduce test_mem_order test_new_threads test_sregint test_threads test_vector test_sfix test_sqrt test_math test_custom_array test_fix_array test_all; do
 	run_test $test
     done
 fi
diff --git a/Scripts/test_32.sh b/Scripts/test_32.sh
index 21533966..1b22f8bb 100755
--- a/Scripts/test_32.sh
+++ b/Scripts/test_32.sh
@@ -41,7 +41,7 @@ else
     test_opts="-s"
     compile_opts="--stop"
     # We do not run sfix, float, math, sqrt and custom_array tests for 32 bit inputs 
-    for test in test_all test_array test_branch test_branching test_comparison test_count test_empty_tape test_flex test_for_range_multithread test_function test_idle_threads test_lib test_loop test_map_reduce test_mem_order test_new_threads test_threads test_vector; do
+    for test in test_array test_branch test_branching test_comparison test_count test_empty_tape test_flex test_for_range_multithread test_function test_idle_threads test_lib test_loop test_map_reduce test_mem_order test_new_threads test_threads test_vector test_all; do
 	run_test $test
     done
 fi
diff --git a/Test/FHE-P.cpp b/Test/FHE-P.cpp
index 38493180..9f1aaa05 100644
--- a/Test/FHE-P.cpp
+++ b/Test/FHE-P.cpp
@@ -17,6 +17,10 @@ int main()
   cout << "Enter plaintext modulus size " << endl;
   cin >> lg2p;
 
+  int newHB;
+  cout << "Enter the NewHope loop bound" << endl;
+  cin >> newHB;
+
   vector<int> sec= {1, 40, 80, 128};
 
   for (int s1= 2; s1 < 4; s1++)
@@ -30,7 +34,7 @@ int main()
                   cout << "\nn = 2: Comp_Sec = " << sec[s1] << ": DD_stat_sec = " << sec[s2];
                   cout << ": ZK_sound_sec = " << sec[s3] << ": ZK_slack_sec=" << sec[s4] << endl;
                   p= 0;
-                  Generate_Parameters(N, p0, p1, p, lg2p, hwt, 2, TopGear, sec[s1], sec[s2], sec[s3], sec[s4]);
+                  Generate_Parameters(N, p0, p1, p, lg2p, 2, TopGear, hwt, newHB, sec[s1], sec[s2], sec[s3], sec[s4]);
                 }
             }
         }
diff --git a/Test/Test-Adv-FHE.cpp b/Test/Test-Adv-FHE.cpp
index b3691f92..9a0a63b8 100644
--- a/Test/Test-Adv-FHE.cpp
+++ b/Test/Test-Adv-FHE.cpp
@@ -29,7 +29,7 @@ void Gen_FHE_Data(int &n, Ring &Rg, FFT_Data &PTD, FHE_Params &params,
   n= 2;
   bigint p0, p1, pr= 0;
   unsigned int hwt= 64, N;
-  Generate_Parameters(N, p0, p1, pr, 128, hwt, n, version);
+  Generate_Parameters(N, p0, p1, pr, 128, n, version, hwt);
 
   Rg.initialize(2 * N);
   gfp::init_field(pr);
diff --git a/Test/Test-FHE.cpp b/Test/Test-FHE.cpp
index 6cb0bb37..bddf47f0 100644
--- a/Test/Test-FHE.cpp
+++ b/Test/Test-FHE.cpp
@@ -576,7 +576,7 @@ int main()
   cout << "\n\nTesting Main FHE parameters" << endl;
   unsigned int hwt= 64;
   p= 0;
-  Generate_Parameters(N, p0, p1, p, 128, hwt, 3, TopGear);
+  Generate_Parameters(N, p0, p1, p, 128, 3, TopGear, hwt);
 
   Rg.initialize(2 * N);
 
diff --git a/compile.py b/compile.py
index ed3c8a8b..f0120b0b 100755
--- a/compile.py
+++ b/compile.py
@@ -59,7 +59,7 @@ def compilation():
         prog.write_bytes(options.outfile)
         if options.asmoutfile:
             for tape in prog.tapes:
-                tape.write_str(options.asmoutfile + '-' + tape.name)
+                tape.write_str(options.asmoutfile + '-' + tape.name+'.asm')
 
     if options.profile:
         import cProfile
diff --git a/run_tests.sh b/run_tests.sh
index ea6d0bf2..e02c7c29 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -2,7 +2,7 @@
 
 cp -R Auto-Test-Data/Cert-Store/* Cert-Store/
 
-for i in $(seq 0 9); do
+for i in $(seq 0 25); do
   \rm -f Scripts/logs/$i
 done
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000..57269b25
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,184 @@
+cmake_minimum_required(VERSION 3.12...3.14)
+
+# CMake >= 3.12 is required: this file calls target_link_libraries() on
+# OBJECT libraries (ot, system), which older releases reject. From 3.12
+# on, the <min>...<max> form above also sets the policy version itself.
+
+project(SCALE_MAMBA VERSION 1.0 DESCRIPTION "SCALE MAMBA" LANGUAGES CXX)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake-modules)
+set(CMAKE_CXX_STANDARD 11)
+
+include_directories($<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+
+# TODO: can MAX_MOD_SZ be made a runtime parameter instead of a compile-time define?
+# TODO: why is MAX_MOD_SZ not defined in config.h as well?
+add_definitions(-DMAX_MOD_SZ=7)
+add_compile_options(
+        -maes
+        -mpclmul
+        -msse4.1
+        -mavx
+        -march=core2
+)
+
+find_package(MPIR REQUIRED)
+find_package(MPIRXX REQUIRED)
+find_package(OpenSSL REQUIRED)
+find_package(CryptoPP REQUIRED)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+
+add_library(exceptions OBJECT
+        Exceptions/handler.cpp
+        )
+
+add_library(fhe OBJECT
+        FHE/Ciphertext.cpp
+        FHE/Distributions.cpp
+        FHE/FFT.cpp
+        FHE/FFT_Data.cpp
+        FHE/FHE_Keys.cpp
+        FHE/FHE_Params.cpp
+        FHE/Plaintext.cpp
+        FHE/Random_Coins.cpp
+        FHE/Ring.cpp
+        FHE/Ring_Element.cpp
+        FHE/Rq_Element.cpp
+        FHE/ZKPoK.cpp
+        )
+
+add_library(gc OBJECT
+        GC/Base_Circuits.cpp
+        GC/Circuit.cpp
+        GC/Garbled.cpp
+        GC/SimplifyCircuit.cpp
+        )
+
+add_library(io OBJECT
+        Input_Output/Input_Output_Base.cpp
+        Input_Output/Input_Output_Simple.cpp
+        )
+
+add_library(lsss OBJECT
+        LSSS/CAS.cpp
+        LSSS/MSP.cpp
+        LSSS/Open_Protocol.cpp
+        LSSS/PRSS.cpp
+        LSSS/PRZS.cpp
+        LSSS/Schur_Matrices.cpp
+        LSSS/Share.cpp
+        LSSS/ShareData.cpp
+        )
+
+add_library(mpcmath OBJECT
+        Math/bigint.cpp
+        Math/gf2n.cpp
+        Math/gfp.cpp
+        Math/Integer.cpp
+        Math/Matrix.cpp
+        Math/modp.cpp
+        Math/Zp_Data.cpp
+        )
+
+add_library(offline OBJECT
+        Offline/DABitGenerator.cpp
+        Offline/DABitMachine.cpp
+        Offline/FakePrep.cpp
+        Offline/FHE_Factory.cpp
+        Offline/offline.cpp
+        Offline/offline_data.cpp
+        Offline/offline_FHE.cpp
+        Offline/offline_IO_production.cpp
+        Offline/offline_Maurer.cpp
+        Offline/offline_phases.cpp
+        Offline/offline_Reduced.cpp
+        Offline/offline_subroutines.cpp
+        Offline/sacrifice.cpp
+        Offline/XOR_Machine.cpp
+        )
+
+add_library(online OBJECT
+        Online/Machine.cpp
+        Online/Online.cpp
+        Online/Schedule.cpp
+        )
+
+add_library(ot OBJECT
+        OT/aAND.cpp
+        OT/aAND_Thread.cpp
+        OT/aBit.cpp
+        OT/aBit_Thread.cpp
+        OT/aBitFactory.cpp
+        OT/aBitVector.cpp
+        OT/BitMatrix.cpp
+        OT/BitVector.cpp
+        OT/COT.cpp
+        OT/CRS.cpp
+        OT/DMC.cpp
+        OT/HaAND.cpp
+        OT/LaAND.cpp
+        OT/ROT.cpp
+        OT/SimpleROT.cpp
+        )
+target_link_libraries(ot PRIVATE ${CRYPTOPP_LIBRARIES})
+target_include_directories(ot PRIVATE ${CRYPTOPP_INCLUDE_DIRS})
+
+add_library(processor OBJECT
+        Processor/Instruction.cpp
+        Processor/Memory.cpp
+        Processor/Processor.cpp
+        Processor/Processor_IO.cpp
+        Processor/Program.cpp
+        )
+
+add_library(system OBJECT
+        System/Networking.cpp
+        System/Player.cpp
+        System/RunTime.cpp
+        System/SystemData.cpp
+        )
+target_link_libraries(system PRIVATE OpenSSL::SSL)
+
+add_library(tools OBJECT
+        Tools/aes-ni.cpp
+        Tools/aes.cpp
+        Tools/CBC-MAC.cpp
+        Tools/Crypto.cpp
+        Tools/MMO.cpp
+        Tools/random.cpp
+        Tools/Timer.cpp
+        Tools/util_containers.cpp
+        )
+
+add_library(local OBJECT
+        Local/BLAS.cpp
+        Local/Local_Functions.cpp
+        )
+
+add_library(scale
+        $<TARGET_OBJECTS:exceptions>
+        $<TARGET_OBJECTS:fhe>
+        $<TARGET_OBJECTS:gc>
+        $<TARGET_OBJECTS:io>
+        $<TARGET_OBJECTS:local>
+        $<TARGET_OBJECTS:lsss>
+        $<TARGET_OBJECTS:mpcmath>
+        $<TARGET_OBJECTS:offline>
+        $<TARGET_OBJECTS:online>
+        $<TARGET_OBJECTS:ot>
+        $<TARGET_OBJECTS:processor>
+        $<TARGET_OBJECTS:system>
+        $<TARGET_OBJECTS:tools>
+        )
+target_link_libraries(scale PRIVATE
+        Threads::Threads OpenSSL::Crypto OpenSSL::SSL
+        ${CRYPTOPP_LIBRARIES} ${MPIR_LIBRARIES} ${MPIRXX_LIBRARIES}
+        )
+
+add_executable(player Player.cpp)
+target_link_libraries(player PRIVATE scale)
+
+add_executable(setup Setup.cpp)
+target_link_libraries(setup PRIVATE OpenSSL::Crypto OpenSSL::SSL scale)
diff --git a/src/FHE/Distributions.cpp b/src/FHE/Distributions.cpp
index ccdaab68..6c942916 100644
--- a/src/FHE/Distributions.cpp
+++ b/src/FHE/Distributions.cpp
@@ -7,6 +7,7 @@ All rights reserved
 
 #include "Distributions.h"
 #include "math.h"
+#include "config.h"
 
 int sample_half(PRNG &G)
 {
@@ -19,21 +20,29 @@ int sample_half(PRNG &G)
     return -1;
 }
 
-/* This uses the approximation to a Gaussian via binomial distribution */
+/* This uses the approximation to a Gaussian via 
+ * binomial distribution 
+ *  
+ * This procedure consumes 2*NewHopeB bits, where NewHopeB
+ * is defined in config.h
+ */
 int sample_Gauss(PRNG &G)
 {
   int s= 0;
-  uint8_t ss[5];
-  G.get_random_bytes(ss, 5);
-  for (int j= 0; j < 5; j++)
+  uint8_t ss[(2*NewHopeB+7)/8];
+  G.get_random_bytes(ss, (2*NewHopeB+7)/8);
+  int cnt=0;
+  for (int j= 0; j < (2*NewHopeB+7)/8; j++)
     {
-      for (int k= 0; k < 4; k++)
+      for (int k= 0; k < 4 && cnt < NewHopeB; k++)
         {
           s= s + (int) (ss[j] & 1);
           ss[j]= ss[j] >> 1;
           s= s - (int) (ss[j] & 1);
           ss[j]= ss[j] >> 1;
+	  cnt++;
         }
     }
   return s;
 }
+
diff --git a/src/FHE/Distributions.h b/src/FHE/Distributions.h
index e86c5876..8027cce5 100644
--- a/src/FHE/Distributions.h
+++ b/src/FHE/Distributions.h
@@ -18,7 +18,7 @@ All rights reserved
 // 1/2=0, 1/4=-1, 1/4=1
 int sample_half(PRNG &G);
 
-// Following is std dev =3.2
+// Following has std dev = sqrt(NewHopeB/2), with NewHopeB taken from config.h
 int sample_Gauss(PRNG &G);
 
 class RandomGenerator : public Generator<bigint>
diff --git a/src/FHE/FHE_Keys.cpp b/src/FHE/FHE_Keys.cpp
index ef3ec82d..8ac6faa3 100644
--- a/src/FHE/FHE_Keys.cpp
+++ b/src/FHE/FHE_Keys.cpp
@@ -46,7 +46,11 @@ Rq_Element FHE_PK::sample_secret_key(PRNG &G)
 {
   Rq_Element sk= FHE_SK(*this).s();
   // Generate the secret key
-  sk.from_vec((*params).sample_Hwt(G));
+  if ((*params).get_hwt()>0)
+    { sk.from_vec((*params).sample_Hwt(G)); }
+  else
+    { sk.from(GaussianGenerator(G));        }
+
   return sk;
 }
 
diff --git a/src/FHE/FHE_Params.cpp b/src/FHE/FHE_Params.cpp
index 16b28a74..37b3285c 100644
--- a/src/FHE/FHE_Params.cpp
+++ b/src/FHE/FHE_Params.cpp
@@ -16,11 +16,10 @@ All rights reserved
  * security parameters. Please see the Chapter in the document
  * for how this is created etc
  */
-#define num_params 6
+#define num_params 7
 int FHE_Sec_Params[num_params][4]= {
-    {1024, 44, 29, 16}, {2048, 86, 56, 31}, {4096, 171, 111, 60}, {8192, 344, 220, 120}, {16384, 690, 440, 239}, {32768, 998, 883, 478}};
+    {1024, 40, 25, 12}, {2048, 82, 52, 26}, {4096, 167, 106, 56}, {8192, 340, 215, 115}, {16384, 686, 436, 235}, {32768, 1392, 879, 473}, {65536, 2830, 1778, 953}};
 
-#define sigma 3.16
 
 void produce_epsilon_constants(double C[3])
 {
@@ -65,8 +64,10 @@ bigint make_prime(int lg2, int N, const bigint &q= 0,
 
 /* Generates N,p0,p1 and p given input hwt h, log2p, n=nplayers */
 void Generate_Parameters(unsigned int &N, bigint &p0, bigint &p1, bigint &p, int lg2p,
-                         unsigned int h, unsigned int n,
+			 unsigned int n,
                          PoKVersion version,
+                         unsigned int h, 
+			 int NewHopeB_t,
                          int comp_sec_t,
                          int DD_stat_sec_t,
                          int ZK_sound_sec_t,
@@ -74,7 +75,8 @@ void Generate_Parameters(unsigned int &N, bigint &p0, bigint &p1, bigint &p, int
 {
   double pp= exp2((double) lg2p), ss= exp2((double) DD_stat_sec_t);
   double Sslack= exp2((double) ZK_slack_sec_t);
-  double lgp0, lgp1, lgq, hh= h;
+  double lgp0, lgp1, lgq;
+  double sigma=sqrt(NewHopeB_t/2.0);
 
   double C[3];
   produce_epsilon_constants(C);
@@ -106,10 +108,13 @@ void Generate_Parameters(unsigned int &N, bigint &p0, bigint &p1, bigint &p, int
     {
       N= FHE_Sec_Params[i][0];
       double phim= N;
+      double varsk=sigma*sqrt(phim); // default: Gaussian secret key
+      if ((int) h > 0) // h is unsigned; compare as signed so h=-1 is not read as a huge Hamming weight
+	{ varsk=sqrt((double) h); }
 
       // New
       double B_Clean=
-          1.0 / 2 + 20 * C[1] * sigma * sqrt(phim) + 20 + 20 * C[1] * sqrt(h);
+          1.0 / 2 + NewHopeB_t * C[1] * sigma * sqrt(phim) + NewHopeB_t + NewHopeB_t * C[1] * varsk; // bound follows the NewHopeB_t argument, like sigma
       if (version == HighGear)
         {
           B_Clean*= phim * S32 * 2 * n * pp;
@@ -122,7 +127,7 @@ void Generate_Parameters(unsigned int &N, bigint &p0, bigint &p1, bigint &p, int
           B_Clean*= phim * Sslack * 4.0 * n * pp;
         }
 
-      double B_Scale= pp * (C[1] * sqrt(phim / 12) + C[2] * sqrt(phim * hh / 12));
+      double B_Scale= pp * (C[1] * sqrt(phim / 12) + C[2] * sqrt(phim / 12)*varsk);
       double B_KS= pp * C[2] * sigma * phim / sqrt(12);
       for (lgq= 10; lgq < FHE_Sec_Params[i][index] && !done; lgq+= 10)
         {
@@ -174,7 +179,7 @@ void Generate_Parameters(unsigned int &N, bigint &p0, bigint &p1, bigint &p, int
 }
 
 void FHE_Params::set(const Ring &R, const bigint &p0, const bigint &p1,
-                     unsigned int h, unsigned int n,
+                     int h, unsigned int n,
                      bool check)
 {
   Zp_Data prD0(p0);
@@ -201,8 +206,8 @@ vector<bigint> FHE_Params::sample_Hwt(PRNG &G) const
     {
       ans[i]= 0;
     }
-  unsigned int cnt= 0, j= 0;
   uint8_t ch= 0;
+  int cnt=0, j=0;
   while (cnt < hwt)
     {
       unsigned int i= G.get_uint() % n;
diff --git a/src/FHE/FHE_Params.h b/src/FHE/FHE_Params.h
index d7aaad21..70403c50 100644
--- a/src/FHE/FHE_Params.h
+++ b/src/FHE/FHE_Params.h
@@ -32,10 +32,15 @@ enum PoKVersion { HighGear,
  *   - The default values here are what we use in the main system
  *     We have them as parameters here so we can test different
  *     things in the Test routines (FHE-p.x)
+ *
+ *     Use h=-1 to define a secret key with discrete Gaussian as
+ *     opposed to Hwt distribution
  */
 void Generate_Parameters(unsigned int &N, bigint &p0, bigint &p1, bigint &p, int lg2p,
-                         unsigned int h, unsigned int n,
+			 unsigned int n,
                          PoKVersion version,
+                         unsigned int h=HwtSK, 
+			 int NewHopeB_t=NewHopeB,
                          int comp_sec_t= comp_sec,
                          int DD_stat_sec_t= DD_stat_sec,
                          int ZK_sound_sec_t= ZK_sound_sec,
@@ -50,7 +55,8 @@ class FHE_Params
   // Data for distributed decryption
   bigint Bval; // Bound
 
-  unsigned int hwt;
+  // Hamming weight of secret key or -1 if using Gaussian secret key
+  int hwt;
 
 public:
   FHE_Params()
@@ -61,8 +67,9 @@ class FHE_Params
   // Rely on default copy assignment/constructor (not that they should
   // ever be needed)
 
-  void set(const Ring &R, const bigint &pr0, const bigint &pr1, unsigned int hwt, unsigned int n,
-           bool check= true);
+  void set(const Ring &R, const bigint &pr0, const bigint &pr1, int hwtSK, unsigned int n, bool check= true);
+
+  int get_hwt() const { return hwt; }
 
   const vector<FFT_Data> &FFTD() const
   {
diff --git a/src/FHE/ZKPoK.cpp b/src/FHE/ZKPoK.cpp
index bfbbbf54..69f974ef 100644
--- a/src/FHE/ZKPoK.cpp
+++ b/src/FHE/ZKPoK.cpp
@@ -43,98 +43,106 @@ void ZKPoK::Step0(condition type, PoKVersion vers,
 
   // Make plaintexts first
   m.resize(U, Plaintext(PTD));
-  vector<bigint> bi_alpha(alpha.size());
-  if (PoKType == Diagonal)
-    {
-      for (unsigned int i= 0; i < alpha.size(); i++)
-        {
-          to_bigint(bi_alpha[i], alpha[i]);
-        }
-    }
+  E.resize(U, pk.get_params());
 
-  for (unsigned int i= 0; i < U; i++)
+  if (single == false || prover == true)
     {
+      vector<bigint> bi_alpha(alpha.size());
       if (PoKType == Diagonal)
         {
-          m[i].assign_zero();
-          m[i].set_coeff(0, bi_alpha[i % alpha.size()]);
+          for (unsigned int i= 0; i < alpha.size(); i++)
+            {
+              to_bigint(bi_alpha[i], alpha[i]);
+            }
         }
-      else
+
+      for (unsigned int i= 0; i < U; i++)
         {
-          m[i].randomize(G);
+          if (PoKType == Diagonal)
+            {
+              m[i].assign_zero();
+              m[i].set_coeff(0, bi_alpha[i % alpha.size()]);
+            }
+          else
+            {
+              m[i].randomize(G);
+            }
         }
-    }
 
-  // Now make the random coins for the encryptions
-  r.resize(U, Random_Coins(pk.get_params()));
-  if (PoKType == Diagonal && version == HighGear)
-    {
-      for (unsigned int i= 0; i < alpha.size(); i++)
+      // Now make the random coins for the encryptions
+      r.resize(U, Random_Coins(pk.get_params()));
+      if (PoKType == Diagonal && version == HighGear)
         {
-          r[i].generate(G);
+          for (unsigned int i= 0; i < alpha.size(); i++)
+            {
+              r[i].generate(G);
+            }
+          for (unsigned int i= alpha.size(); i < U; i++)
+            {
+              r[i]= r[i % alpha.size()];
+            }
         }
-      for (unsigned int i= alpha.size(); i < U; i++)
+      else
         {
-          r[i]= r[i % alpha.size()];
+          for (unsigned int i= 0; i < U; i++)
+            {
+              r[i].generate(G);
+            }
         }
-    }
-  else
-    {
+
+      // Now do the encryptions
       for (unsigned int i= 0; i < U; i++)
         {
-          r[i].generate(G);
+          pk.encrypt(E[i], m[i], r[i]);
         }
     }
-
-  // Now do the encryptions
-  E.resize(U, pk.get_params());
-  for (unsigned int i= 0; i < U; i++)
-    {
-      pk.encrypt(E[i], m[i], r[i]);
-    }
 }
 
 void ZKPoK::Step1(const FHE_PK &pk, const FFT_Data &PTD, PRNG &G)
 {
-  // We first do the fake plaintexts
-  bigint Bp= PTD.get_prime() << (ZK_ZK_sec - 1);
-  vector<bigint> vv(pk.get_params().phi_m());
-  if (PoKType == Diagonal)
-    {
-      for (unsigned int i= 0; i < pk.get_params().phi_m(); i++)
-        {
-          vv[i]= 0;
-        }
-    }
-
   Z.resize(V, Rq_Element(pk.get_params().FFTD(), polynomial, polynomial));
-  for (unsigned int i= 0; i < V; i++)
+  T.resize(V, pk.get_params());
+  A.resize(V, pk.get_params());
+
+  if (single == false || prover == true)
     {
+      // We first do the fake plaintexts
+      bigint Bp= PTD.get_prime() << (ZK_ZK_sec - 1);
+      vector<bigint> vv(pk.get_params().phi_m());
       if (PoKType == Diagonal)
         {
-          vv[0]= randomBnd(G, 2 * Bp + 1) - Bp;
-          Z[i].from_vec(vv);
+          for (unsigned int i= 0; i < pk.get_params().phi_m(); i++)
+            {
+              vv[i]= 0;
+            }
         }
-      else
+
+      for (unsigned int i= 0; i < V; i++)
         {
-          Z[i].from(UniformGenerator(G, Bp));
+          if (PoKType == Diagonal)
+            {
+              vv[0]= randomBnd(G, 2 * Bp + 1) - Bp;
+              Z[i].from_vec(vv);
+            }
+          else
+            {
+              Z[i].from(UniformGenerator(G, Bp));
+            }
         }
-    }
 
-  // Now do the fake random coins
-  // s in the notes, but we add them directly into T[i])
-  T.resize(V, pk.get_params());
-  bigint B1= bigint(1) << ZK_ZK_sec, B2= 20 * B1;
-  for (unsigned int i= 0; i < V; i++)
-    {
-      T[i].generateUniform(G, B1, B2, B2);
-    }
+      // Now do the fake random coins
+      // s in the notes, but we add them directly into T[i])
+      bigint B1= bigint(1) << ZK_ZK_sec, B2= NewHopeB * B1;
+      for (unsigned int i= 0; i < V; i++)
+        {
+          T[i].generateUniform(G, B1, B2, B2);
+        }
 
-  // Now do the fake encryptions
-  A.resize(V, pk.get_params());
-  for (unsigned int i= 0; i < V; i++)
-    {
-      pk.quasi_encrypt(A[i], Z[i], T[i]);
+      // Now do the fake encryptions
+      for (unsigned int i= 0; i < V; i++)
+        {
+          pk.quasi_encrypt(A[i], Z[i], T[i]);
+        }
     }
 }
 
@@ -200,8 +208,15 @@ void ZKPoK::Step0_Step(istream &vE, const FHE_PK &pk)
   Ciphertext eq(pk.get_params()), temp(pk.get_params());
   for (unsigned int i= 0; i < E.size(); i++)
     {
-      eq.input(vE);
-      add(E[i], E[i], eq);
+      if (single == true && prover == false)
+        {
+          E[i].input(vE);
+        }
+      else if (single == false)
+        {
+          eq.input(vE);
+          add(E[i], E[i], eq);
+        }
     }
 }
 
@@ -211,8 +226,42 @@ void ZKPoK::Step1_Step(istream &vA, const FHE_PK &pk)
   Ciphertext eq(pk.get_params()), temp(pk.get_params());
   for (unsigned int i= 0; i < A.size(); i++)
     {
-      eq.input(vA);
-      add(A[i], A[i], eq);
+      if (single == true && prover == false)
+        {
+          A[i].input(vA);
+        }
+      else if (single == false)
+        {
+          eq.input(vA);
+          add(A[i], A[i], eq);
+        }
+    }
+}
+
+// Player calls this to enter each other player's vectors vT and vZ
+void ZKPoK::Step2_Step(istream &vT, istream &vZ, const FHE_PK &pk)
+{
+  Random_Coins Ti(pk.get_params());
+  Rq_Element Zi(pk.get_params().FFTD(), polynomial, polynomial);
+  Ciphertext temp(pk.get_params());
+
+  for (unsigned int i= 0; i < T.size(); i++)
+    {
+      if (single == true && prover == false)
+        {
+          T[i].input(vT);
+          Z[i].input(vZ);
+          pk.quasi_encrypt(eq[i], Z[i], T[i]);
+        }
+      else if (single == false)
+        {
+          Ti.input(vT);
+          Zi.input(vZ);
+          add(T[i], T[i], Ti);
+          add(Z[i], Z[i], Zi);
+          pk.quasi_encrypt(temp, Zi, Ti);
+          add(eq[i], eq[i], temp);
+        }
     }
 }
 
@@ -262,141 +311,137 @@ void ZKPoK::Generate_e(vector<int> &e, uint8_t seed[SEED_SIZE])
 void ZKPoK::Step2(const vector<int> &ee, const FHE_PK &pk)
 {
   e= ee;
+  eq.resize(V, Ciphertext(pk.get_params()));
 
-  // First compute the z vector for this player and the T matrix
-  vector<Rq_Element> x(U, Rq_Element(pk.get_params().FFTD(), polynomial, polynomial));
-
-  Rq_Element temp_rq;
-  Random_Coins temp_rc(pk.get_params());
-  for (unsigned int i= 0; i < U; i++)
+  if (single == false || prover == true)
     {
-      x[i].from(m[i].get_iterator());
-    }
-  for (unsigned int i= 0; i < V; i++)
-    {
-      for (unsigned int j= 0; j < U; j++)
+      // First compute the z vector for this player and the T matrix
+      vector<Rq_Element> x(U, Rq_Element(pk.get_params().FFTD(), polynomial, polynomial));
+
+      Rq_Element temp_rq;
+      Random_Coins temp_rc(pk.get_params());
+      for (unsigned int i= 0; i < U; i++)
+        {
+          x[i].from(m[i].get_iterator());
+        }
+      for (unsigned int i= 0; i < V; i++)
         {
-          int mm= M(i, j, e);
-          if (version == TopGear && PoKType != Diagonal)
+          for (unsigned int j= 0; j < U; j++)
             {
-              if (mm >= 0)
+              int mm= M(i, j, e);
+              if (version == TopGear && PoKType != Diagonal)
                 {
-                  mul_by_X_i(temp_rq, x[j], mm);
-                  mul_by_X_i(temp_rc, r[j], mm);
-                  add(Z[i], Z[i], temp_rq);
-                  add(T[i], T[i], temp_rc);
+                  if (mm >= 0)
+                    {
+                      mul_by_X_i(temp_rq, x[j], mm);
+                      mul_by_X_i(temp_rc, r[j], mm);
+                      add(Z[i], Z[i], temp_rq);
+                      add(T[i], T[i], temp_rc);
+                    }
                 }
-            }
-          else
-            {
-              if (mm == 1)
+              else
                 {
-                  add(Z[i], Z[i], x[j]);
-                  add(T[i], T[i], r[j]);
+                  if (mm == 1)
+                    {
+                      add(Z[i], Z[i], x[j]);
+                      add(T[i], T[i], r[j]);
+                    }
                 }
             }
         }
-    }
-
-  eq.resize(V, Ciphertext(pk.get_params()));
-  for (unsigned int i= 0; i < V; i++)
-    {
-      pk.quasi_encrypt(eq[i], Z[i], T[i]);
-    }
-}
 
-// Player calls this to enter the each other players vector vT
-void ZKPoK::Step2_Step(istream &vT, istream &vZ, const FHE_PK &pk)
-{
-  Random_Coins Ti(pk.get_params());
-  Rq_Element Zi(pk.get_params().FFTD(), polynomial, polynomial);
-  Ciphertext temp(pk.get_params());
-  for (unsigned int i= 0; i < T.size(); i++)
-    {
-      Ti.input(vT);
-      Zi.input(vZ);
-      add(T[i], T[i], Ti);
-      add(Z[i], Z[i], Zi);
-      pk.quasi_encrypt(temp, Zi, Ti);
-      add(eq[i], eq[i], temp);
+      for (unsigned int i= 0; i < V; i++)
+        {
+          pk.quasi_encrypt(eq[i], Z[i], T[i]);
+        }
     }
 }
 
 bool ZKPoK::Step3(const FHE_PK &pk, const FFT_Data &PTD, unsigned int nplayers)
 {
-  Ciphertext temp(pk.get_params());
 
-  // First the check on  sum_players (d[i]-a[i])-M_e*E = 0
-  for (unsigned int i= 0; i < V; i++)
+  if (single == false || prover == false)
     {
-      sub(eq[i], eq[i], A[i]);
-      for (unsigned int j= 0; j < U; j++)
+      Ciphertext temp(pk.get_params());
+
+      // First the check on  sum_players (d[i]-a[i])-M_e*E = 0
+      for (unsigned int i= 0; i < V; i++)
         {
-          int mm= M(i, j, e);
-          if (version == TopGear && PoKType != Diagonal)
+          sub(eq[i], eq[i], A[i]);
+          for (unsigned int j= 0; j < U; j++)
             {
-              if (mm >= 0)
+              int mm= M(i, j, e);
+              if (version == TopGear && PoKType != Diagonal)
                 {
-                  mul_by_X_i(temp, E[j], mm);
-                  sub(eq[i], eq[i], temp);
+                  if (mm >= 0)
+                    {
+                      mul_by_X_i(temp, E[j], mm);
+                      sub(eq[i], eq[i], temp);
+                    }
                 }
-            }
-          else
-            {
-              if (mm == 1)
+              else
                 {
-                  sub(eq[i], eq[i], E[j]);
+                  if (mm == 1)
+                    {
+                      sub(eq[i], eq[i], E[j]);
+                    }
                 }
             }
+          if (!eq[i].c0().is_zero() || !eq[i].c1().is_zero())
+            {
+              cout << "Eq not zero : " << i << endl;
+              return false;
+            }
         }
-      if (!eq[i].c0().is_zero() || !eq[i].c1().is_zero())
-        {
-          cout << "Eq not zero : " << i << endl;
-          return false;
-        }
-    }
 
-  // Check z
-  bigint Bz= nplayers * (PTD.get_prime() << (ZK_ZK_sec));
-  //Rq_Element sz(pk.get_params().FFTD(), polynomial, polynomial);
-  for (unsigned int i= 0; i < V; i++)
-    {
-      if (Z[i].infinity_norm() > Bz)
+      // Check z
+      bigint Bz= (PTD.get_prime() << (ZK_ZK_sec));
+      bigint B1= (bigint(1) << (ZK_ZK_sec + 1));
+      if (single == false)
         {
-          cout << "z too big" << endl;
-          return false;
+          B1*= nplayers;
+          Bz*= nplayers;
         }
-      if (PoKType == Diagonal)
+      bigint B2= B1 * NewHopeB;
+      //Rq_Element sz(pk.get_params().FFTD(), polynomial, polynomial);
+      for (unsigned int i= 0; i < V; i++)
         {
-          vector<bigint> te= Z[i].to_vec_bigint();
-          for (unsigned int j= 1; j < te.size(); j++)
-            if (te[j] != 0)
-              {
-                cout << "Not diag " << endl;
-                return false;
-              }
+          if (Z[i].infinity_norm() > Bz)
+            {
+              cout << "z too big" << endl;
+              return false;
+            }
+          if (PoKType == Diagonal)
+            {
+              vector<bigint> te= Z[i].to_vec_bigint();
+              for (unsigned int j= 1; j < te.size(); j++)
+                if (te[j] != 0)
+                  {
+                    cout << "Not diag " << endl;
+                    return false;
+                  }
+            }
         }
-    }
 
-  // Check T
-  bigint B1= nplayers * (bigint(1) << (ZK_ZK_sec + 1)), B2= B1 * 20;
-  //Random_Coins sT(pk.get_params());
-  for (unsigned int i= 0; i < V; i++)
-    {
-      if (T[i].u().infinity_norm() > B1)
+      // Check T
+      //  Random_Coins sT(pk.get_params());
+      for (unsigned int i= 0; i < V; i++)
         {
-          cout << "u too big" << endl;
-          return false;
-        }
-      if (T[i].v().infinity_norm() > B2)
-        {
-          cout << "v too big" << endl;
-          return false;
-        }
-      if (T[i].w().infinity_norm() > B2)
-        {
-          cout << "w too big" << endl;
-          return false;
+          if (T[i].u().infinity_norm() > B1)
+            {
+              cout << "u too big" << endl;
+              return false;
+            }
+          if (T[i].v().infinity_norm() > B2)
+            {
+              cout << "v too big" << endl;
+              return false;
+            }
+          if (T[i].w().infinity_norm() > B2)
+            {
+              cout << "w too big" << endl;
+              return false;
+            }
         }
     }
 
diff --git a/src/FHE/ZKPoK.h b/src/FHE/ZKPoK.h
index ac8da89a..b846dd91 100644
--- a/src/FHE/ZKPoK.h
+++ b/src/FHE/ZKPoK.h
@@ -12,7 +12,12 @@ All rights reserved
  * correctness of plaintexts
  *
  * Each player runs an instance of ZKPoK
- *
+ *   When single=true this instance runs as either the prover or
+ *   the verifier; otherwise every player acts as a prover (as in
+ *   HighGear/TopGear), with a sum as the final statement.
+ *   The former case is needed for FHE_IO creation. When
+ *   single=true we need to know whether we are the prover
+ *   or the verifier.
  *
  * Step 1:
  *
@@ -67,6 +72,7 @@ class ZKPoK
 {
   condition PoKType;
   PoKVersion version;
+  bool single, prover;
 
   unsigned int U, V;
   // The *actual* soundness security we achieve
@@ -74,6 +80,7 @@ class ZKPoK
   unsigned int ssec;
 
   // Associated random coins for this player for the valid ciphertexts
+  //   If single=true this only holds something if I am the prover
   vector<Random_Coins> r;
 
   vector<int> e;
@@ -82,16 +89,22 @@ class ZKPoK
 
   // A[j] is the j-ciphertext of the test vectors in Step 1
   // In Step1_Step we add in the other players A vectors one by one
+  // if single=false, otherwise this is just the data from the prover/me
   vector<Ciphertext> A;
 
   // E[j] is j th ciphertext from me in Step 1
   // In Step1_Step we add in the other players E vectors one by one
+  // if single=false, otherwise this is just the data from the prover
+  // when prover=false, and my ciphertexts when prover=true
   vector<Ciphertext> E;
 
+  //   If single=true this is just the data from the prover/me
   vector<Rq_Element> Z;   // The vector of z's
   vector<Random_Coins> T; // The vector of T's
 
-  vector<Plaintext> m; // The plaintexts for this player
+  // The plaintexts for this player
+  //   If single=true this only holds something if I am the prover
+  vector<Plaintext> m;
 
   vector<Ciphertext> eq; // Main equation checking vector
 
@@ -99,6 +112,30 @@ class ZKPoK
   int M(unsigned int k, unsigned int l, const vector<int> &e);
 
 public:
+  // Default settings (standard TopGear)
+  ZKPoK()
+  {
+    single= false;
+    prover= false;
+  }
+
+  // Settings for use with the IO setting
+  //   - Basically run old SPDZ-2 proof with new technology
+  void set_params(bool sin= false, bool prov= false)
+  {
+    single= sin;
+    prover= prov;
+  }
+
+  bool is_single() const
+  {
+    return single;
+  }
+  bool is_prover() const
+  {
+    return prover;
+  }
+
   /* Set up the initial ciphertexts we are going to prove.
    * We do this in a seperate call, as then we can repeat the proof
    * for the same ciphertexts over and over again, to increase
@@ -115,6 +152,9 @@ class ZKPoK
 
   void Step1(const FHE_PK &pk, const FFT_Data &PTD, PRNG &G);
 
+  // Player calls this to enter each of the other players' vectors vA
+  void Step1_Step(istream &vA, const FHE_PK &pk);
+
   // Get my vA for broadcasting
   void get_vA(ostream &s) const;
   // Get my vE for broadcasting
@@ -124,9 +164,6 @@ class ZKPoK
   // Get my vz for broadcasting
   void get_vZ(ostream &s) const;
 
-  // Player calls this to enter the each other players vectors vA
-  void Step1_Step(istream &vA, const FHE_PK &pk);
-
   // Generate the vector e for Step 2 from a random seed
   void Generate_e(vector<int> &e, uint8_t seed[SEED_SIZE]);
 
diff --git a/src/LSSS/Share.h b/src/LSSS/Share.h
index a1f1f903..fda960d1 100644
--- a/src/LSSS/Share.h
+++ b/src/LSSS/Share.h
@@ -152,7 +152,12 @@ class Share
     res.mul(*this, x);
     return res;
   }
-
+  Share operator<<(int i) const
+  {
+	  Share res;
+	  res.mul(*this, gfp(1) << i);
+	  return res;
+  }
   Share &operator+=(const Share &x)
   {
     add(x);
@@ -163,6 +168,10 @@ class Share
     mul(*this, x);
     return *this;
   }
+  Share &operator<<=(int i)
+  {
+	  return *this = *this<<i;
+  }
 
   // Input and output from a stream
   //  - Can do in human or machine only format (later should be faster)
diff --git a/src/Makefile b/src/Makefile
index 50abdd64..c5a864ed 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -3,11 +3,11 @@ include ../CONFIG.mine
 
 CC = g++
 
-CFLAGS = -Wall -std=c++11 -pedantic -Wextra -g -pthread -I$(ROOT)/src -maes -mpclmul -msse4.1 -mavx -march=core2 $(FLAGS) $(OPT) -I$(OSSL)/include 
+CFLAGS = -Wall -std=c++11 -pedantic -Wextra -g -pthread -I$(ROOT)/src -maes -mpclmul -msse4.1 -mavx -march=core2 $(FLAGS) $(OPT) -I$(OSSL)/include
 CPPFLAGS = $(CFLAGS) 
 LDLIBS = -L/$(OSSL)/lib -lm -lssl -lcrypto -lmpirxx -lmpir -lcryptopp $(LDFLAGS)
 
-all: $(LIB) Setup.x Player.x 
+all: $(LIB) Setup.x Player.x
 
 OFFLINE = $(patsubst %.cpp,%.o,$(wildcard Offline/*.cpp))
 ONLINE = $(patsubst %.cpp,%.o,$(wildcard Online/*.cpp))
diff --git a/src/Math/Matrix.h b/src/Math/Matrix.h
index f0179700..3cdde4be 100644
--- a/src/Math/Matrix.h
+++ b/src/Math/Matrix.h
@@ -52,6 +52,7 @@ vector<gfp> BackSubst(const gfp_matrix &A);
 
 bool is_zero(const vector<gfp> &x);
 
+// Hwt returns the Hamming weight of vector D
 int Hwt(const vector<int> &D);
 
 #endif
diff --git a/src/OT/COT.h b/src/OT/COT.h
index e01251bf..7c391e2f 100644
--- a/src/OT/COT.h
+++ b/src/OT/COT.h
@@ -35,15 +35,16 @@ class Sender_COT
 #endif
   int pair;
 
+public:
   // Execute the next extraction procedure
   // This is protocol of Figure 19 of eprint 2015/901
   // We pass in the aBit here and this procedure sets the t value
   // Note we pass in sz_aB, as aB/x could be much bigger
+  //   - This is passively secure only!
   void next_iteration(Player &P, unsigned int sz_aB,
                       vector<aBit> &aB,
                       const BitVector &x);
 
-public:
   void init(Player &P, int i, CryptoPP::RandomPool &RNG);
 
   // This is a COT with player i
@@ -67,15 +68,16 @@ class Receiver_COT
   int pair;
   gf2n Delta;
 
+  void compute_Delta();
+
+public:
   // Execute the next extraction procedure
   // This is protocol of Figure 19 of eprint 2015/901
   // We pass in the aBit here and this procedure sets the q value
   // Note we pass in sz_aB, as aB could be much bigger
+  //   - This is passively secure only!
   void next_iteration(Player &P, unsigned int sz_aB, vector<aBit> &aB);
 
-  void compute_Delta();
-
-public:
   // This is a COT with player i
   void init(Player &P, int i, CryptoPP::RandomPool &RNG, vector<int> choicebits);
 
diff --git a/src/OT/OT_Thread_Data.h b/src/OT/OT_Thread_Data.h
index 95e84005..105df3c8 100644
--- a/src/OT/OT_Thread_Data.h
+++ b/src/OT/OT_Thread_Data.h
@@ -9,11 +9,13 @@ class OT_Thread_Data
 public:
   aBit_Data aBD;
   aAND_Data aAD;
+  bool ready;
 
   void init(unsigned int no_online_threads)
   {
     aBD.aBits.resize(no_online_threads + 1);
     aAD.aANDs.resize(no_online_threads);
+    ready=false;
   }
 };
 
diff --git a/src/OT/aBit.h b/src/OT/aBit.h
index 99fe27a3..3338ba00 100644
--- a/src/OT/aBit.h
+++ b/src/OT/aBit.h
@@ -103,6 +103,20 @@ class aBit
    */
   void get_Share_x_Delta_j(vector<gf2n> &ans) const;
 
+  /* Arithmetic Operators */
+  aBit operator+(const aBit &x) const
+  {
+    aBit res;
+    res.add(*this, x);
+    return res;
+  }
+
+  aBit &operator+=(const aBit &x)
+  {
+    add(x);
+    return *this;
+  }
+
   friend ostream &operator<<(ostream &s, const aBit &y);
 
   void output(ostream &s, bool human) const;
diff --git a/src/OT/aBit_Thread.cpp b/src/OT/aBit_Thread.cpp
index 3f95edc3..04cc7f49 100644
--- a/src/OT/aBit_Thread.cpp
+++ b/src/OT/aBit_Thread.cpp
@@ -40,14 +40,26 @@ int check_exit(Player &P, unsigned int no_online_threads, offline_control_data &
       // Do not die if main offline threads still working
       for (unsigned int i= 0; i < no_online_threads; i++)
         {
-          OCD.sacrifice_mutex[i].lock();
+          OCD.OCD_mutex[i].lock();
           if (OCD.finish_offline[i] != 1)
             {
               ss= "-";
               result= 0;
             }
-          OCD.sacrifice_mutex[i].unlock();
+          OCD.OCD_mutex[i].unlock();
         }
+      // Do not die if an online thread is still working
+      for (unsigned int i= 0; i < OCD.finish_offline.size(); i++)
+        {
+          OCD.OCD_mutex[i].lock();
+          if (OCD.finished_online[i] == 0)
+            {
+              ss= "-";
+              result= 0;
+            }
+          OCD.OCD_mutex[i].unlock();
+        }
+
       P.send_all(ss, 2);
     }
   else
@@ -123,6 +135,7 @@ void aBit_Thread(Player &P, unsigned int no_online_threads,
 
   // Pack the last queue first, as it is used for aANDs
   OTD.aBD.aBD_mutex.lock();
+  OTD.ready=true;
   aBF.tune(P, OTD.aBD.aBits[no_online_threads], verbose);
   if (verbose)
     {
diff --git a/src/Offline/DABitGenerator.cpp b/src/Offline/DABitGenerator.cpp
index fefb3ccc..dd77f97c 100644
--- a/src/Offline/DABitGenerator.cpp
+++ b/src/Offline/DABitGenerator.cpp
@@ -7,22 +7,21 @@ All rights reserved
 
 #include "DABitGenerator.h"
 #include "DABitMachine.h"
-#include "LSSS/PRSS.cpp"
 #include "OT/OT_Thread_Data.h"
 #include "Processor/Processor_IO.h"
 #include "Tools/util_containers.h"
 
 #include "unistd.h"
 #include <cassert>
+#include <list>
 
 extern vector<sacrificed_data> SacrificeD;
 extern OT_Thread_Data OTD;
 
-DABitGenerator::DABitGenerator(MaliciousDABitMachine &machine,
-                               Player &P, int thread_num, offline_control_data &OCDm)
-    : thread_num(thread_num), P(P),
-      machine(machine), OCD(OCDm),
-      xor_machine(P, OCD, thread_num)
+SmallPrimeDABitGenerator::SmallPrimeDABitGenerator(MaliciousDABitMachine &machine,
+                                                   Player &P, int thread_num)
+    : AbstractDABitGenerator(thread_num, P),
+      machine(machine), OCD(*machine.OCD), xor_machine(P, OCD, thread_num)
 {
   G.ReSeed(thread_num);
   // Randomness for FY permutation
@@ -32,9 +31,9 @@ DABitGenerator::DABitGenerator(MaliciousDABitMachine &machine,
 }
 
 // adjusting the MACs and shares for the input bits
-void DABitGenerator::adjust_and_pack(stringstream &ss,
-                                     vector<Share> &Shp, vector<aBit> &Sh2,
-                                     unsigned int player, vector<bool> bits)
+void SmallPrimeDABitGenerator::adjust_and_pack(stringstream &ss,
+                                               vector<Share> &Shp, vector<aBit> &Sh2,
+                                               unsigned int player, vector<bool> bits)
 {
   size_t num_bits= bits.size();
   // Assume we take aBit shares from Garbling
@@ -69,11 +68,11 @@ void DABitGenerator::adjust_and_pack(stringstream &ss,
   vector<Share> vec_random_masks_p;
 
   // Lock to retrieve gf(p) masks
-  OCD.sacrifice_mutex[thread_num].lock();
+  OCD.OCD_mutex[thread_num].lock();
   take_first_to_vector(vec_random_masks_p, SacrificeD[thread_num].ID.ios[player], num_bits);
   if (player == P.whoami())
     take_first_to_vector(opened_masks_p, SacrificeD[thread_num].ID.opened_ios, num_bits);
-  OCD.sacrifice_mutex[thread_num].unlock();
+  OCD.OCD_mutex[thread_num].unlock();
   timers["Retrieving input masks"].stop();
 
   timers["Serializing bits"].start();
@@ -115,7 +114,7 @@ void DABitGenerator::adjust_and_pack(stringstream &ss,
   timers["Serializing bits"].stop();
 }
 
-void DABitGenerator::provide_random_inputs(vector<vector<Share>> &Shp, vector<vector<aBit>> &Sh2, size_t num_bits)
+void SmallPrimeDABitGenerator::provide_random_inputs(vector<vector<Share>> &Shp, vector<vector<aBit>> &Sh2, size_t num_bits)
 {
 
   vector<bool> inp_bits(num_bits);
@@ -184,7 +183,7 @@ void DABitGenerator::provide_random_inputs(vector<vector<Share>> &Shp, vector<ve
 }
 
 // Protocol \pi_{daBits + MPC} from https://eprint.iacr.org/2019/207
-void DABitGenerator::run(daBitVector &dabs)
+void SmallPrimeDABitGenerator::run(daBitVector &dabs)
 {
   vector<Share> allShp, Shp_buckets;
   vector<aBit> allSh2, Sh2_buckets;
@@ -266,16 +265,351 @@ void DABitGenerator::run(daBitVector &dabs)
       dabs.bp[i]= combinedp[i * machine.bucket_size];
       dabs.b2[i]= combined2[i * machine.bucket_size];
     }
+  total+= machine.nBitsPerLoop;
   dabs.used= 0;
 }
 
-size_t DABitGenerator::report_sent()
+LargePrimeDABitGenerator::LargePrimeDABitGenerator(MaliciousDABitMachine &machine,
+                                                   Player &P, int thread_num)
+    : AbstractDABitGenerator(thread_num, P),
+      machine(machine), OCD(*machine.OCD), xor_machine(P, OCD, thread_num)
 {
-  // TODO: Wait until we merge this with benchmarking branch
-  return 0;
+  // Randomness for checking sec LSBs
+  uint8_t seed[SEED_SIZE];
+  AgreeRandom(P, seed, SEED_SIZE);
+  FRand.SetSeedFromRandom(seed);
+}
+
+void LargePrimeDABitGenerator::split_shares(vector<gfp> &out_modp_shares,
+                                            const vector<Share> &modp_bits)
+{
+
+  // TODO: needs fixing if more values per share
+  int m= machine.nBitsPerLoop + machine.sec;
+  int n= (P.nplayers() + 1) / 2;
+
+  vector<stringstream> to_send(P.nplayers());
+  int offset= P.whoami() % 2;
+
+  // Bits are arranged as follows
+  // [b_1, ..., b_n, b_{n+1}, ..., b_{2*n}...
+  // current party will send its shares to its group P_{2*i+offset}
+  // Eg:
+  // P0 should get all shares of b_1^j where j is even,
+  // P1 should get all shares of b_1^j where j is odd
+  // P2 should get all shares of b_2^j where j is even
+  // P3 should get all shares of b_2^j where j is odd.. etc:
+  // where offset = {0,1}, depending on the group parity
+  int last_player= P.nplayers() - 1;
+  for (int k= 0; k < m; k++)
+    for (int i= 0; i < n; i++)
+      {
+        // when odd number of players then 2*i+offset is too large
+        // need to place this if to avoid seg faults
+        if (2 * i + offset < (int) P.nplayers())
+          modp_bits[k * n + i].get_share(0).output(to_send[2 * i + offset], false);
+        else
+          modp_bits[k * n + i].get_share(0).output(to_send[last_player], false);
+      }
+  // see whether P is in the even / odd group
+  // adjust the offset accordingly
+  vector<int> parity_group;
+  for (size_t i= offset; i < P.nplayers(); i+= 2)
+    if (i != P.whoami())
+      parity_group.push_back(i);
+
+  size_t num_ingroup= parity_group.size();
+
+  // add in the os buffer solution
+  vector<string> os(P.nplayers());
+  for (size_t i= 0; i < num_ingroup; i++)
+    {
+      size_t other_player= parity_group[i];
+      if (other_player > P.whoami())
+        {
+          P.send_to_player(other_player, to_send[other_player].str());
+        }
+      else if (other_player < P.whoami())
+        {
+          P.receive_from_player(other_player, os[other_player]);
+        }
+    }
+
+  // now do it reverse
+  for (size_t i= 0; i < num_ingroup; i++)
+    {
+      size_t other_player= parity_group[i];
+      if (other_player > P.whoami())
+        {
+          P.receive_from_player(other_player, os[other_player]);
+        }
+      else if (other_player < P.whoami())
+        {
+          P.send_to_player(other_player, to_send[other_player].str());
+        }
+    }
+
+  // if odd nr of players then parties who belong to different parity group
+  // send their bit share to the last party
+  if (P.nplayers() % 2)
+    {
+      if (P.whoami() % 2)
+        P.send_to_player(last_player, to_send[last_player].str());
+      if ((int) P.whoami() == last_player)
+        {
+          for (size_t i= 1; i < P.nplayers(); i+= 2)
+            {
+              P.receive_from_player(i, os[i]);
+              // add outsider player to same collecting group
+              parity_group.push_back(i);
+              // this is for the last loop to pickup and sum the received
+              // shares from all parties
+            }
+        }
+    }
+
+  out_modp_shares.resize(m * n);
+  // copy own shares
+  for (int i= 0; i < m; i++)
+    for (int k= 0; k < n; k++)
+      out_modp_shares[i * n + k]= modp_bits[i * n + k].get_share(0);
+
+  int own_index= P.whoami() / 2;
+  // Parse the GF(p) shares and add them locally
+  for (int other_player : parity_group)
+    {
+      stringstream is(os[other_player]);
+      gfp tmp;
+      for (int j= 0; j < m; j++)
+        {
+          tmp.input(is, false);
+          out_modp_shares[j * n + own_index]+= tmp;
+        }
+    }
+}
+
+// converts the modp bits vector to a matrix of shares to do a row wise gfp xor
+void LargePrimeDABitGenerator::prepare_for_xor(vector<vector<Share>> &bit_rows, const vector<Share> &modp_bits)
+{
+
+  int n= (P.nplayers() + 1) / 2;
+  int m= machine.nBitsPerLoop + machine.sec;
+
+  bit_rows.clear();
+  for (int i= 0; i < m; i++)
+    {
+      vector<Share> tmp;
+      for (int j= 0; j < n; ++j)
+        tmp.push_back(modp_bits[i * n + j]);
+      bit_rows.push_back(tmp);
+    }
+}
+
+// Re-share every player's private GF(2) bits as aBits; Sh2[i] is the aBit sum over all players
+void LargePrimeDABitGenerator::input_GF2_bits(vector<aBit> &Sh2, const vector<bool> &bits)
+{
+  vector<vector<aBit>> allRandomMasks(P.nplayers());
+  vector<vector<int>> allOpenedMasks(P.nplayers());
+
+  int num_dabits= machine.nBitsPerLoop + machine.sec;
+
+  // bits holds (nplayers + 1) / 2 entries per daBit; this player only
+  // enters the one at index whoami / 2 of each such group
+  vector<bool> compressed_bits(num_dabits);
+
+  int extra_bits= (P.nplayers() + 1) / 2;
+  int own_bit_index= P.whoami() / 2;
+  for (int i= 0; i < num_dabits; i++)
+    compressed_bits[i]= bits[i * extra_bits + own_bit_index];
+
+  for (size_t pnum= 0; pnum < P.nplayers(); pnum++)
+    {
+      timers["get aShares"].start();
+      list<aBit> random_masks_2= OTD.aBD.get_aShares(thread_num, num_dabits);
+      timers["get aShares"].stop();
+      // Now open masks in F_2 to current player
+      take_first_to_vector(allRandomMasks[pnum], random_masks_2, num_dabits);
+      Open_aBits_To(allOpenedMasks[pnum], pnum, allRandomMasks[pnum], P);
+    }
+
+  Sh2.resize(num_dabits);
+  //adjust shares mod2
+  size_t my_num= P.whoami();
+  // pack signal bits
+  stringstream ss;
+  for (int i= 0; i < num_dabits; i++)
+    {
+      int wire_bit= compressed_bits[i];
+      wire_bit^= allOpenedMasks[my_num][i];
+      ss << (char) wire_bit;
+      aBit epsilon_2= allRandomMasks[my_num][i];
+      epsilon_2.add(wire_bit);
+      Sh2[i]= epsilon_2;
+    }
+
+  // sum all signal bits from other players
+  vector<string> os(P.nplayers());
+  os[P.whoami()]= ss.str();
+  P.Broadcast_Receive(os);
+  for (size_t pnum= 0; pnum < P.nplayers(); pnum++)
+    {
+      if (pnum != P.whoami())
+        {
+          istringstream is(os[pnum]);
+          // signal bits are packed one char per daBit (see the packing loop above)
+          for (int i= 0; i < num_dabits; i++)
+            {
+              char wire_bit;
+              is >> wire_bit;
+              Sh2[i].add(wire_bit);
+              // Need to locally add the rest of the sharings for each party
+              Sh2[i].add(allRandomMasks[pnum][i]);
+            }
+        }
+    }
+}
+
+// Consistency check between the GF(p) and GF(2) sharings of the daBits.
+// Each of the first sec GF(p) bits is masked with an even random value built
+// from preprocessed random bits, opened together with its aBit counterpart,
+// and the least significant bits of the two openings are compared.
+//   modp_bits - GF(p) sharings, mod2_bits - aBit sharings of the same bits
+// Throws Sacrifice_Check_Error if any opened pair of LSBs disagrees.
+// NOTE(review): only positions 0..sec-1 are opened here and FRand is not
+// used to combine the remaining bits - confirm against eprint 2019/974.
+void LargePrimeDABitGenerator::check_public_lsbs(vector<Share> modp_bits, vector<aBit> mod2_bits)
+{
+  int sec= machine.sec;
+
+  // log_nBits = number of bits needed to write machine.nBitsPerLoop
+  int log_nBits= 1;
+  while ((1LL << log_nBits) <= machine.nBitsPerLoop)
+    ++log_nBits;
+
+  // Each of the sec masks consumes sec + log_nBits random bits modp, so
+  // retrieve exactly sec * (sec + log_nBits) of them
+  int num_bits_needed= sec * (sec + log_nBits);
+  timers["Waiting for random bits in GF(p)"].start();
+  Wait_For_Preproc(DATA_BIT, num_bits_needed, thread_num, OCD, P.whoami());
+  timers["Waiting for random bits in GF(p)"].stop();
+
+  list<Share> prep_bits;
+  // Lock to retrieve GF(p) random bits
+  OCD.bit_mutex[thread_num].lock();
+  auto &sacrificed_bits= SacrificeD[thread_num].BD.bb;
+  prep_bits.splice(prep_bits.begin(), sacrificed_bits,
+                   sacrificed_bits.begin(), next(sacrificed_bits.begin(), num_bits_needed));
+  OCD.bit_mutex[thread_num].unlock();
+
+  // Now compute private lsbs
+  vector<Share> lsb_modp(sec, Share(P.whoami()));
+  vector<aBit> lsb_mod2(sec);
+
+  // add in a random mask [2*r] of sec + log_nbits bitlength
+  auto it_prep_bit= prep_bits.begin();
+  vector<gfp> power_of_two(sec + log_nBits + 1, 1);
+  for (int i = 1; i <= sec + log_nBits; i++)
+    power_of_two[i] = power_of_two[i - 1] * 2;
+
+  for (int i= 0; i < sec; i++)
+    {
+      for (int j= 0; j < sec + log_nBits; j++)
+        {
+          // Need to preserve parity of the opened lsb mask
+          // so we keep the lsb unmasked by offsetting bit operations by 1
+          lsb_modp[i]+= (*it_prep_bit * power_of_two[j + 1]);
+          it_prep_bit++;
+        }
+    }
+  // add in extra bit which will be discarded
+  for (int i= 0; i < sec; i++)
+    {
+      lsb_modp[i]+= modp_bits[i];
+      lsb_mod2[i]+= mod2_bits[i];
+    }
+  // Now open and compare lsb
+  vector<gfp> opened_modp;
+  OP.Open_To_All_Begin(opened_modp, lsb_modp, P, 2);
+  OP.Open_To_All_End(opened_modp, lsb_modp, P, 2);
+
+  vector<int> opened_mod2;
+  Open_aBits(opened_mod2, lsb_mod2, P);
+  for (int i= 0; i < sec; ++i)
+    {
+      bool x= (opened_modp[i] & 1).is_one();
+      bool y= opened_mod2[i];
+      if (x != y)
+        throw Sacrifice_Check_Error("daBit correctness error");
+    }
+  OP.RunOpenCheck(P, "", 2);
+}
+// Fill dabs with machine.nBitsPerLoop fresh daBits (large-prime variant)
+void LargePrimeDABitGenerator::run(daBitVector &dabs)
+{
+  size_t num_dabits= machine.nBitsPerLoop + machine.sec;
+  int extra_gfp_xors= int(P.nplayers() + 1) / 2 - 1;
+  int m= num_dabits * (extra_gfp_xors + 1);
+  // Now retrieve random GF(p) bits: wait until all m of them (not just
+  // num_dabits) are in the queue, since m of them are taken below
+  timers["Waiting for random bits in GF(p)"].start();
+  Wait_For_Preproc(DATA_BIT, m, thread_num, OCD, P.whoami());
+  timers["Waiting for random bits in GF(p)"].stop();
+
+  vector<Share> random_bits_modp;
+  // Lock to retrieve GF(p) random bits
+  OCD.bit_mutex[thread_num].lock();
+  take_first_to_vector(random_bits_modp, SacrificeD[thread_num].BD.bb, m);
+  OCD.bit_mutex[thread_num].unlock();
+
+  // if n-party case then simplify to 2-party case
+  vector<gfp> half_shares;
+  if (P.nplayers() > 2)
+    {
+      this->split_shares(half_shares, random_bits_modp);
+      vector<vector<Share>> row_arranged;
+      this->prepare_for_xor(row_arranged, random_bits_modp);
+      this->xor_machine.party_log_xor(random_bits_modp, row_arranged, this->OP);
+    }
+  else
+    {
+      for (auto &rbit : random_bits_modp)
+        half_shares.push_back(rbit.get_share(0));
+    }
+
+  vector<bool> mod2_half_shares(half_shares.size());
+  for (size_t i= 0; i < mod2_half_shares.size(); i++)
+    mod2_half_shares[i]= (half_shares[i] & 1).is_one();
+
+  // Input shares to GC
+  vector<aBit> mod2_shares(num_dabits);
+  this->input_GF2_bits(mod2_shares, mod2_half_shares);
+
+  // Offset bit shares by 1
+  // Only tested for N <= 3 parties
+  aBit x;
+  if ((P.nplayers() / 2) % 2)
+    x.assign_one();
+  else
+    x.assign_zero();
+  for (size_t i= 0; i < mod2_shares.size(); i++)
+    mod2_shares[i]+= x;
+
+  // Now do sec checks and compare public LSBs
+  this->check_public_lsbs(random_bits_modp, mod2_shares);
+
+  // Final bit output, discard first sec bits which were used for checks
+  dabs.b2.resize(machine.nBitsPerLoop);
+  dabs.bp.resize(machine.nBitsPerLoop);
+  for (int i= 0; i < machine.nBitsPerLoop; ++i)
+    {
+      dabs.bp[i]= random_bits_modp[machine.sec + i];
+      dabs.b2[i]= mod2_shares[machine.sec + i];
+    }
+  dabs.used= 0;
+  total+= machine.nBitsPerLoop;
 }
 
-void daBitVector::get_daBit(Share &bpr, aBit &b2r, DABitGenerator &daBitGen)
+void daBitVector::get_daBit(Share &bpr, aBit &b2r, AbstractDABitGenerator &daBitGen)
 {
   if (used >= bp.size())
     {
@@ -287,7 +621,7 @@ void daBitVector::get_daBit(Share &bpr, aBit &b2r, DABitGenerator &daBitGen)
 }
 
 void daBitVector::get_daBits(vector<Share> &bpr, vector<aBit> &b2r,
-                             DABitGenerator &daBitGen)
+                             AbstractDABitGenerator &daBitGen)
 {
   if (bpr.size() != b2r.size())
     {
diff --git a/src/Offline/DABitGenerator.h b/src/Offline/DABitGenerator.h
index 20fd0b1a..b46aec13 100644
--- a/src/Offline/DABitGenerator.h
+++ b/src/Offline/DABitGenerator.h
@@ -16,6 +16,7 @@ All rights reserved
 
 #include "LSSS/Open_Protocol.h"
 #include "LSSS/Share.h"
+#include "LSSS/PRSS.h"
 #include "Math/gf2n.h"
 #include "Math/gfp.h"
 #include "OT/aBit.h"
@@ -28,7 +29,7 @@ All rights reserved
 #include <vector>
 
 class MaliciousDABitMachine;
-class DABitGenerator;
+class AbstractDABitGenerator;
 
 // Holds vectors of bits with are double shared
 // In the future this needs to be templated
@@ -48,23 +49,47 @@ class daBitVector
   }
 
   // Gets the next daBit, and generates more if needed
-  void get_daBit(Share &bpr, aBit &b2r, DABitGenerator &daBitGen);
+  void get_daBit(Share &bpr, aBit &b2r, AbstractDABitGenerator &daBitGen);
 
   // As above but for a vector
-  void get_daBits(vector<Share> &bpr, vector<aBit> &b2r, DABitGenerator &daBitGen);
+  void get_daBits(vector<Share> &bpr, vector<aBit> &b2r, AbstractDABitGenerator &daBitGen);
 
-  friend class DABitGenerator;
+  friend class LargePrimeDABitGenerator;
+  friend class SmallPrimeDABitGenerator;
 };
 
 // Main class for generating daBits
-class DABitGenerator
+
+class AbstractDABitGenerator
 {
-  unsigned int thread_num;
+protected:
+  int thread_num;
+
+  // MAC Check for GFp bits
+  //   Do not use the one from the main online thread in case the
+  //   open/closes get mixed up
+  Open_Protocol OP;
 
+public:
   Player &P;
+  long long total;
+
+  map<string, Timer> timers;
+
+  // Previous code created a new player, according to thread_num
+  // Now we just copy by reference, might want to change this in the future.
+  AbstractDABitGenerator(int thread_num, Player &P) : thread_num(thread_num), P(P), total(0) {}
+
+  virtual ~AbstractDABitGenerator() {}
+  virtual void run(daBitVector &dabs)= 0;
+  int get_thread_num() const { return thread_num; }
+};
+
+// this is classic cut-and-choose from https://eprint.iacr.org/2019/207
+class SmallPrimeDABitGenerator : public AbstractDABitGenerator
+{
 
   MaliciousDABitMachine &machine;
-  offline_control_data &OCD;
 
   // Randomness for bit generation
   PRNG G;
@@ -72,14 +97,15 @@ class DABitGenerator
   // Shared randomness for FRand
   PRNG FRand;
 
-  // MAC Check for GFp bits
-  //   Do not use the one from the main online thread in case the
-  //   open/closes get mixed up
-  Open_Protocol OP;
+  // OCD for getting preprocessed data such as triples or random bits
+  offline_control_data &OCD;
 
   // XOR Machine for doing the CNC and n-party gf(p) xor
   XOR_Machine xor_machine;
 
+  // The debug counter `total` is inherited from AbstractDABitGenerator;
+  // redeclaring it here would shadow it and leave this copy uninitialised
+
   void adjust_and_pack(stringstream &ss, vector<Share> &Shp, vector<aBit> &Sh2,
                        unsigned int player, vector<bool> bits);
 
@@ -87,13 +113,36 @@ class DABitGenerator
                              vector<vector<aBit>> &Sh2, size_t nBits);
 
 public:
-  DABitGenerator(MaliciousDABitMachine &machine, Player &P, int thread_num, offline_control_data &OCD);
+  SmallPrimeDABitGenerator(MaliciousDABitMachine &machine, Player &P, int thread_num);
   void run(daBitVector &dabs);
-  int get_thread_num() const { return thread_num; }
+};
 
-  map<string, Timer> timers;
+// This is using the improved dabit generation from https://eprint.iacr.org/2019/974
+// which works for large primes; currently we've only implemented it for dishonest
+// majority
+class LargePrimeDABitGenerator : public AbstractDABitGenerator
+{
+  // to get parameters state
+  MaliciousDABitMachine &machine;
+
+  // OCD for getting preprocessed data such as triples or random bits
+  offline_control_data &OCD;
 
-  size_t report_sent();
+  // Shared randomness for FRand
+  // (seeded in the constructor from the output of AgreeRandom)
+  PRNG FRand;
+
+  // XOR Machine for the n/2-party gf(p) xor
+  XOR_Machine xor_machine;
+
+  void input_GF2_bits(vector<aBit> &out_sh2, const vector<bool> &bits);
+  void check_public_lsbs(const vector<Share> modp_bits, const vector<aBit> mod2_bits);
+  void split_shares(vector<gfp> &out_modp_shares, const vector<Share> &modp_bits);
+  void prepare_for_xor(vector<vector<Share>> &bit_rows, const vector<Share> &modp_bits);
+
+public:
+  LargePrimeDABitGenerator(MaliciousDABitMachine &machine, Player &P, int thread_num);
+  void run(daBitVector &dabs);
 };
 
 #endif /* SRC_OFFLINE_DABITGENERATOR_H_ */
diff --git a/src/Offline/DABitMachine.cpp b/src/Offline/DABitMachine.cpp
index 2d0940a2..cf1fb699 100644
--- a/src/Offline/DABitMachine.cpp
+++ b/src/Offline/DABitMachine.cpp
@@ -15,12 +15,18 @@ All rights reserved
 #include "DABitMachine.h"
 #include "config.h"
 
-DABitMachineBase::DABitMachineBase() : nBitsPerLoop(kdaBitsPerLoop), sec(daBits_stat_sec), cnc_param(0), bucket_size(0)
+DABitMachineBase::DABitMachineBase() : nBitsPerLoop(kdaBitsPerLoop), sec(daBits_stat_sec), cnc_param(0),
+	bucket_size(0)
 {
 }
 
-void MaliciousDABitMachine::Initialize(uint nparties)
+MaliciousDABitMachine::MaliciousDABitMachine(): n_parties(0), OCD(0)
 {
+}
+
+void MaliciousDABitMachine::Initialize(uint nparties, offline_control_data& _OCD)
+{
+  this->OCD = &_OCD;
   // add pre computed cnc parameters using input/triple factor 15.0
   // 40 bit stat sec, 8192 dabits per loop; C = 2, B = 3
   pre_cnc_params[make_pair(40, 8192)]= make_pair(2, 3);
@@ -87,3 +93,11 @@ void MaliciousDABitMachine::find_cnc_params()
     }
   //cout << "Selected for bucketing: C = " << cnc_param << " B = " << bucket_size << endl;
 }
+
+AbstractDABitGenerator *MaliciousDABitMachine::new_generator(Player &P, int thread_num)
+{
+  // Large-prime generator only when numBits(p) >= 64 and the sharing type is Full
+  if (numBits(gfp::pr()) < 64 || Share::SD.type != Full)
+    return new SmallPrimeDABitGenerator(*this, P, thread_num);
+  return new LargePrimeDABitGenerator(*this, P, thread_num);
+}
diff --git a/src/Offline/DABitMachine.h b/src/Offline/DABitMachine.h
index 47aeb2c0..1951a48a 100644
--- a/src/Offline/DABitMachine.h
+++ b/src/Offline/DABitMachine.h
@@ -48,11 +48,16 @@ class DABitMachineBase
  */
 class MaliciousDABitMachine : public DABitMachineBase
 {
+
 public:
   unsigned int n_parties;
+  offline_control_data* OCD;
 
-  void Initialize(unsigned int n_parties);
+  MaliciousDABitMachine();
+  void Initialize(unsigned int n_parties, offline_control_data& OCD);
   void find_cnc_params();
+  AbstractDABitGenerator *new_generator(Player &P, int thread_num);
 };
 
+
 #endif /* SRC_OFFLINE_DABITMACHINE_H_ */
diff --git a/src/Offline/FHE_Factory.cpp b/src/Offline/FHE_Factory.cpp
index 730c629d..479485e0 100644
--- a/src/Offline/FHE_Factory.cpp
+++ b/src/Offline/FHE_Factory.cpp
@@ -19,7 +19,6 @@ extern Timer global_time;
 
 FHE_Industry::FHE_Industry(unsigned int maxnumber)
 {
-  ready= false;
   Factory_List_Lock= new mutex[maxnumber];
   Current_Factory_Lock= new mutex[maxnumber];
   Factory.resize(maxnumber);
@@ -37,6 +36,7 @@ FHE_Industry::FHE_Industry(unsigned int maxnumber)
           Current_Factory_Lock[i].lock();
         }
     }
+  ready= false;
 }
 
 FHE_Industry::~FHE_Industry()
@@ -189,67 +189,69 @@ int FHE_Industry::Next_Off_Production_Line(Plaintext &mess, Ciphertext &ctx,
 /* If one party says every thread is finished then finish */
 bool FHE_Industry::is_finished(unsigned int num, Player &P, const offline_control_data &OCD)
 {
-  bool finished= true, wait= false;
-  for (unsigned int i= 0; i < OCD.finished_offline.size() && finished; i++)
+  bool finished= false, wait= true;
+  // Loop if we are waiting and not finished
+  while (wait && !finished)
     {
-      //printf("Waiting for sacrifice lock %d %d B\n",num,i); fflush(stdout);
-      OCD.sacrifice_mutex[i].lock();
-      //printf("Got sacrifice lock %d %d B\n",num,i); fflush(stdout);
-      if (OCD.finished_offline[i] < 4)
+      finished= true, wait= false;
+      for (unsigned int i= 0; i < OCD.finished_offline.size() && finished; i++)
         {
-          finished= false;
+          //printf("Waiting for sacrifice lock %d %d B\n",num,i); fflush(stdout);
+          OCD.OCD_mutex[i].lock();
+          //printf("Got sacrifice lock %d %d B\n",num,i); fflush(stdout);
+          if (OCD.finished_offline[i] < 4)
+            {
+              finished= false;
+            }
+          OCD.OCD_mutex[i].unlock();
+          //printf("Released sacrifice lock %d %d B\n",num,i); fflush(stdout);
         }
-      OCD.sacrifice_mutex[i].unlock();
-      //printf("Released sacrifice lock %d %d B\n",num,i); fflush(stdout);
-    }
-  // We want to sleep if the factory list is getting too big the amount
-  // and trigger for sleep depending on whether we are using HighGear
-  // or TopGear
-  //printf("Waiting for FL lock %d C\n",num); fflush(stdout);
-  Factory_List_Lock[num].lock();
-  int wt= 10;
+      // We want to sleep if the factory list is getting too big; the
+      // trigger for sleeping depends on whether we are using HighGear
+      // or TopGear
+      //printf("Waiting for FL lock %d C\n",num); fflush(stdout);
+      Factory_List_Lock[num].lock();
 #ifndef TOP_GEAR
-  if (Factory[num].size() > 0)
-    {
-      wait= true;
-      wt= Factory[num].size() * 30;
-    }
+      if (Factory[num].size() > 2)
+        {
+          wait= true;
+        }
 #else
-  if (Factory[num].size() > 4)
-    {
-      wait= true;
-      wt= Factory[num].size() * 10;
-    }
+      if (Factory[num].size() > 4)
+        {
+          wait= true;
+        }
 #endif
-  Factory_List_Lock[num].unlock();
-  // Need to sync players
-  vector<string> o(P.nplayers());
-  o[P.whoami()]= "N";
-  if (wait)
-    {
-      o[P.whoami()]= "W";
-    }
-  if (finished)
-    {
-      o[P.whoami()]= "Y";
-    }
-  //printf("\n is_finished B : %d %s\n",P.whoami(),o[P.whoami()].c_str());
-  P.Broadcast_Receive(o);
-  for (unsigned int p= 0; p < P.nplayers(); p++)
-    {
-      //printf("\n is_finished R : %d %s\n",p,o[p].c_str());
-      if (o[p].compare("Y") == 0)
+      Factory_List_Lock[num].unlock();
+      // Need to sync players
+      vector<string> o(P.nplayers());
+      o[P.whoami()]= "N";
+      if (wait)
         {
-          finished= true;
+          o[P.whoami()]= "W";
         }
-      if (o[p].compare("W") == 0)
+      if (finished)
         {
-          wait= true;
+          o[P.whoami()]= "Y";
+        }
+      //printf("\n is_finished B : %d %s\n",P.whoami(),o[P.whoami()].c_str());
+      P.Broadcast_Receive(o);
+      for (unsigned int p= 0; p < P.nplayers(); p++)
+        {
+          //printf("\n is_finished R : %d %s\n",p,o[p].c_str());
+          if (o[p].compare("Y") == 0)
+            {
+              finished= true;
+            }
+          if (o[p].compare("W") == 0)
+            {
+              wait= true;
+            }
+        }
+      if (wait && !finished)
+        {
+          sleep(5);
         }
-    }
-  if (wait && !finished)
-    {
-      sleep(wt);
     }
   return finished;
 }
@@ -308,7 +310,7 @@ bool Do_ZKPoK(ZKPoK &ZK, Player &P,
       return true;
     }
 
-  // Transmit E and A data
+  // Transmit A data
   //   - Put within a scoping to ensure data is removed
   if (verbose > 0)
     {
@@ -473,9 +475,10 @@ void FHE_Industry::FHE_Factory(Player &P, const offline_control_data &OCD, const
           {
             Current_Factory_Lock[i].unlock();
           }
-      }
-      ready= true;
 
+        // And signal we are ready
+        ready= true;
+      }
       if (verbose > 0)
         {
           printf("\nFinished mac ctxs : %f \n\n", global_time.elapsed());
@@ -488,14 +491,7 @@ void FHE_Industry::FHE_Factory(Player &P, const offline_control_data &OCD, const
       finished= is_finished(mynumber, P, OCD);
 
       // Only execute a new ZKPoK if we have not finished
-      // AND the size is not too big
-      bool too_big= false;
-#ifndef TOP_GEAR
-      too_big= Factory[mynumber].size() > 5;
-#else
-      too_big= Factory[mynumber].size() > 10;
-#endif
-      if (!finished && !too_big)
+      if (!finished)
         {
           ZKPoK ZK;
           if (verbose > 0)
diff --git a/src/Offline/FHE_Factory.h b/src/Offline/FHE_Factory.h
index 08773fea..37586085 100644
--- a/src/Offline/FHE_Factory.h
+++ b/src/Offline/FHE_Factory.h
@@ -34,7 +34,7 @@ class FHE_Industry
 
   // The encryptions of the macs
   vector<Ciphertext> ctx_macs;
-  // Signals whether ctx_macs are ready
+  // bool to say whether the ctx_macs above are ready yet
   bool ready;
 
   // Assume Current_Factory_Lock[num] is locked!!!
@@ -45,6 +45,7 @@ class FHE_Industry
   ~FHE_Industry();
 
   bool is_ready() const { return ready; }
+
   const Ciphertext &ct_mac(unsigned int i) const
   {
     return ctx_macs[i];
diff --git a/src/Offline/FakePrep.cpp b/src/Offline/FakePrep.cpp
index e94d1878..7e255ca7 100644
--- a/src/Offline/FakePrep.cpp
+++ b/src/Offline/FakePrep.cpp
@@ -102,7 +102,7 @@ void FakePrep::produce_triples(list<Share> &a, list<Share> &b, list<Share> &c)
   triples[2]= c;
   rewind_triples= true;
 }
-void FakePrep::produce_squares(list<Share> &a, list<Share> &b)
+void FakePrep::produce_squares(list<Share> &a, list<Share> &b, unsigned int rep)
 {
   if (rewind_squares)
     {
@@ -127,7 +127,7 @@ void FakePrep::produce_squares(list<Share> &a, list<Share> &b)
       vector<gfp> amacs(Share::SD.nmacs);
       vector<gfp> bmacs(Share::SD.nmacs);
       vector<Share> sa(n), sb(n);
-      for (int i= 0; i < sz_offline_batch; i++)
+      while (a.size() < sz_offline_batch * rep)
         {
           aa.randomize(PRG);
           bb.mul(aa, aa);
@@ -152,7 +152,7 @@ void FakePrep::produce_squares(list<Share> &a, list<Share> &b)
     }
   else
     {
-      for (int i= 0; i < sz_offline_batch; i++)
+      while (a.size() < sz_offline_batch * rep)
         {
           string ss;
           P.receive_from_player(0, ss);
diff --git a/src/Offline/FakePrep.h b/src/Offline/FakePrep.h
index dd5307b4..31e0bb55 100644
--- a/src/Offline/FakePrep.h
+++ b/src/Offline/FakePrep.h
@@ -30,7 +30,7 @@ class FakePrep
 public:
   FakePrep(Player &P);
   void produce_triples(list<Share> &a, list<Share> &b, list<Share> &c);
-  void produce_squares(list<Share> &a, list<Share> &b);
+  void produce_squares(list<Share> &a, list<Share> &b, unsigned int rep= 1);
   void produce_bits(list<Share> &b);
 };
 
diff --git a/src/Offline/XOR_Machine.cpp b/src/Offline/XOR_Machine.cpp
index a6a2d49b..908d075b 100644
--- a/src/Offline/XOR_Machine.cpp
+++ b/src/Offline/XOR_Machine.cpp
@@ -74,7 +74,7 @@ void XOR_Machine::xors(vector<Share> &result, const vector<Share> &LBits,
   Wait_For_Preproc(DATA_TRIPLE, n, thread, OCD);
   list<Share> ta, tb, tc;
 
-  OCD.sacrifice_mutex[thread].lock();
+  OCD.mul_mutex[thread].lock();
 
   auto &sd_ta= SacrificeD[thread].TD.ta;
   auto &sd_tb= SacrificeD[thread].TD.tb;
@@ -84,7 +84,7 @@ void XOR_Machine::xors(vector<Share> &result, const vector<Share> &LBits,
   tb.splice(tb.begin(), sd_tb, sd_tb.begin(), next(sd_tb.begin(), n));
   tc.splice(tc.begin(), sd_tc, sd_tc.begin(), next(sd_tc.begin(), n));
 
-  OCD.sacrifice_mutex[thread].unlock();
+  OCD.mul_mutex[thread].unlock();
 
   vector<vector<Share>> triples(3);
   for (int i= 0; i < 3; i++)
diff --git a/src/Offline/offline.cpp b/src/Offline/offline.cpp
index 926f1c93..3f2ae74c 100644
--- a/src/Offline/offline.cpp
+++ b/src/Offline/offline.cpp
@@ -13,12 +13,14 @@ All rights reserved
 #include "offline_Maurer.h"
 #include "offline_Reduced.h"
 #include "offline_subroutines.h"
+#include "sacrifice.h"
 
 using namespace std;
 
 void offline_phase_triples(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, list<Share> &a,
                            list<Share> &b, list<Share> &c, const FHE_PK &pk,
                            const FHE_SK &sk, const FFT_Data &PTD,
+                           int fake_sacrifice, Open_Protocol &OP,
                            FHE_Industry &industry)
 {
   if (Share::SD.Otype == Fake)
@@ -37,11 +39,15 @@ void offline_phase_triples(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, li
     {
       offline_FHE_triples(P, a, b, c, pk, sk, PTD, industry);
     }
+
+  sacrifice_phase_triples(P, fake_sacrifice, a, b, c, OP);
 }
 
-void offline_phase_squares(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, list<Share> &a,
-                           list<Share> &b, const FHE_PK &pk, const FHE_SK &sk,
+void offline_phase_squares(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep,
+                           list<Share> &a, list<Share> &b,
+                           const FHE_PK &pk, const FHE_SK &sk,
                            const FFT_Data &PTD,
+                           int fake_sacrifice, Open_Protocol &OP,
                            FHE_Industry &industry)
 {
   if (Share::SD.Otype == Fake)
@@ -60,27 +66,41 @@ void offline_phase_squares(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, li
     {
       offline_FHE_squares(P, a, b, pk, sk, PTD, industry);
     }
+
+  sacrifice_phase_squares(P, fake_sacrifice, a, b, OP);
 }
 
-void offline_phase_bits(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, list<Share> &b,
-                        Open_Protocol &OP, const FHE_PK &pk, const FHE_SK &sk,
+void offline_phase_bits(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep,
+                        list<Share> &bits,
+                        const FHE_PK &pk, const FHE_SK &sk,
                         const FFT_Data &PTD,
+                        int fake_sacrifice, Open_Protocol &OP,
                         FHE_Industry &industry)
 {
+  // We need rep times as many squares as bits
+  int rep= sacrifice_stat_sec / numBits(gfp::pr()) + 1;
+
+  list<Share> a, b;
   if (Share::SD.Otype == Fake)
     {
-      prep.produce_bits(b);
+      prep.produce_bits(bits);
+      prep.produce_squares(a, b, rep);
     }
   else if (Share::SD.Otype == Maurer)
     {
-      offline_Maurer_bits(P, prss, b, OP);
+      offline_Maurer_bits(P, prss, bits, OP);
+      offline_Maurer_squares(P, prss, a, b, rep);
     }
   else if (Share::SD.Otype == Reduced)
     {
-      offline_Reduced_bits(P, prss, przs, b, OP);
+      offline_Reduced_bits(P, prss, przs, bits, OP);
+      offline_Reduced_squares(P, prss, przs, a, b, rep);
     }
   else
     {
-      offline_FHE_bits(P, b, pk, sk, PTD, industry);
+      offline_FHE_bits(P, bits, pk, sk, PTD, industry);
+      offline_FHE_squares(P, a, b, pk, sk, PTD, industry, rep);
     }
+
+  sacrifice_phase_bits(P, fake_sacrifice, bits, a, b, OP);
 }
diff --git a/src/Offline/offline.h b/src/Offline/offline.h
index 81da4def..beb6dfc2 100644
--- a/src/Offline/offline.h
+++ b/src/Offline/offline.h
@@ -15,19 +15,25 @@ All rights reserved
 #include "System/Player.h"
 #include <list>
 
-void offline_phase_triples(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, list<Share> &a,
-                           list<Share> &b, list<Share> &c, const FHE_PK &pk,
-                           const FHE_SK &sk, const FFT_Data &PTD,
+void offline_phase_triples(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep,
+                           list<Share> &a, list<Share> &b, list<Share> &c,
+                           const FHE_PK &pk, const FHE_SK &sk,
+                           const FFT_Data &PTD,
+                           int fake_sacrifice, Open_Protocol &OP,
                            FHE_Industry &industry);
 
-void offline_phase_squares(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, list<Share> &a,
-                           list<Share> &b, const FHE_PK &pk, const FHE_SK &sk,
+void offline_phase_squares(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep,
+                           list<Share> &a, list<Share> &b,
+                           const FHE_PK &pk, const FHE_SK &sk,
                            const FFT_Data &PTD,
+                           int fake_sacrifice, Open_Protocol &OP,
                            FHE_Industry &industry);
 
-void offline_phase_bits(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep, list<Share> &b,
-                        Open_Protocol &OP, const FHE_PK &pk, const FHE_SK &sk,
+void offline_phase_bits(Player &P, PRSS &prss, PRZS &przs, FakePrep &prep,
+                        list<Share> &b,
+                        const FHE_PK &pk, const FHE_SK &sk,
                         const FFT_Data &PTD,
+                        int fake_sacrifice, Open_Protocol &OP,
                         FHE_Industry &industry);
 
 #endif
diff --git a/src/Offline/offline_FHE.cpp b/src/Offline/offline_FHE.cpp
index 6598bba2..32d6e496 100644
--- a/src/Offline/offline_FHE.cpp
+++ b/src/Offline/offline_FHE.cpp
@@ -204,7 +204,8 @@ void offline_FHE_triples(Player &P, list<Share> &a, list<Share> &b,
 void offline_FHE_squares(Player &P, list<Share> &a, list<Share> &b,
                          const FHE_PK &pk, const FHE_SK &sk,
                          const FFT_Data &PTD,
-                         FHE_Industry &industry)
+                         FHE_Industry &industry,
+                         unsigned int rep)
 {
   unsigned int nmacs= P.get_mac_keys().size();
 
@@ -213,7 +214,7 @@ void offline_FHE_squares(Player &P, list<Share> &a, list<Share> &b,
   Ciphertext ca(pk.get_params()), cc(pk.get_params()), nc(pk.get_params());
   Ciphertext tmp(pk.get_params());
 
-  while (a.size() < sz_offline_batch)
+  while (a.size() < sz_offline_batch * rep)
     {
       industry.Next_Off_Production_Line(va, ca, P, "Square a");
 
@@ -289,63 +290,147 @@ void offline_FHE_bits(Player &P, list<Share> &a, const FHE_PK &pk,
     }
 }
 
-/* We do not need to do ZKPoKs for input data, we just need for
- * the given person to encrypt some random stuff, send the
- * ciphertext, all parties multiply through by the alpha ciphertext 
- * and then reshare both ciphertexts.
+void Update_ZKPoK(ZKPoK &ZKP, Player &P,
+                  unsigned int prover,
+                  const FHE_PK &pk,
+                  const FFT_Data &PTD)
+{
+  unsigned int nplayers= P.nplayers();
+
+#ifdef TOP_GEAR
+  ZKP.Step0(General, TopGear, pk, PTD, P.G, P.get_mac_keys());
+#else
+  ZKP.Step0(General, HighGear, pk, PTD, P.G, P.get_mac_keys());
+#endif
+
+  // Transmit E data if we are prover, receive data if
+  // we are verifier
+  if (ZKP.is_prover())
+    {
+      stringstream osE;
+      ZKP.get_vE(osE);
+      P.send_all(osE.str());
+    }
+  else
+    {
+      string vsE;
+      P.receive_from_player(prover, vsE);
+      istringstream isE(vsE);
+      ZKP.Step0_Step(isE, pk);
+    }
+
+  ZKP.Step1(pk, PTD, P.G);
+
+  // Again transmit A data if we are prover, receive data if
+  // we are verifier
+  if (ZKP.is_prover())
+    {
+      stringstream osA;
+      ZKP.get_vA(osA);
+      P.send_all(osA.str());
+    }
+  else
+    {
+      string vsA;
+      P.receive_from_player(prover, vsA);
+      istringstream isA(vsA);
+      ZKP.Step1_Step(isA, pk);
+    }
+
+  // Step 2 first step
+  uint8_t seed[SEED_SIZE];
+  AgreeRandom(P, seed, SEED_SIZE);
+  vector<int> e;
+  ZKP.Generate_e(e, seed);
+
+  ZKP.Step2(e, pk);
+
+  if (ZKP.is_prover())
+    {
+      // Transmit T and Z data
+      stringstream osT, osZ;
+      ZKP.get_vT(osT);
+      ZKP.get_vZ(osZ);
+      P.send_all(osT.str());
+      P.send_all(osZ.str());
+    }
+  else
+    {
+      string vsT, vsZ;
+      P.receive_from_player(prover, vsT);
+      P.receive_from_player(prover, vsZ);
+      istringstream isT(vsT), isZ(vsZ);
+      ZKP.Step2_Step(isT, isZ, pk);
+    }
+
+  // Step 3
+  if (!ZKP.Step3(pk, PTD, nplayers))
+    {
+      throw ZKPoK_Fail();
+    }
+}
+
+/* We *need* to do ZKPoKs for input data, unlike what the SPDZ-2
+ * paper says.
  */
 void offline_FHE_IO(Player &P, unsigned int player_num, list<Share> &a,
                     list<gfp> &opened, const FHE_PK &pk, const FHE_SK &sk,
                     const FFT_Data &PTD,
+                    offline_control_data &OCD,
+                    unsigned int online_thread,
                     FHE_Industry &industry)
 {
-  // Spin until ctx_macs are ready
-  while (industry.is_ready() == false)
+
+  // Update the relevant ZKPoK (do this now for ALL input players as
+  // we are probably in the first iteration waiting for the ctx for alpha
+  // to be created)
+  for (unsigned int i= 0; i < P.nplayers(); i++)
     {
-      sleep(5);
+      if (OCD.IO_ZKPoKs[online_thread][i].isempty())
+        {
+          Update_ZKPoK(OCD.IO_ZKPoKs[online_thread][i], P, i, pk, PTD);
+        }
     }
 
-  unsigned int nmacs= P.get_mac_keys().size();
+  int used= OCD.IO_ZKPoKs[online_thread][player_num].get_next_unused();
 
-  Plaintext m(PTD), va(PTD);
-  vector<Plaintext> ga(nmacs, PTD);
-  Ciphertext c(pk.get_params()), tmp(pk.get_params());
+  Plaintext va(PTD), aa(PTD);
+  Ciphertext ca(pk.get_params()), tmp(pk.get_params());
+  OCD.IO_ZKPoKs[online_thread][player_num].get_entry(va, ca, used);
 
-  // Construct the send/receive the main ciphertext
-  if (P.whoami() == player_num)
-    {
-      m.randomize(P.G);
-      c= pk.encrypt(m);
-      ostringstream s;
-      c.output(s);
-      P.send_all(s.str());
-    }
-  else
+  // Reshare the actual shares
+  Reshare(aa, ca, P, sk);
+
+  // Spin until the Macs are ready
+  bool ready= false;
+  while (ready == false)
     {
-      string s;
-      P.receive_from_player(player_num, s);
-      istringstream is(s);
-      c.input(is);
+      ready= industry.is_ready();
+      if (ready == false)
+        {
+          sleep(5);
+        }
     }
 
-  // Reshare the input ciphertext and the MACS
-  Reshare(va, c, P, sk);
+  // Reshare the MACS
+  unsigned int nmacs= P.get_mac_keys().size();
+  vector<Plaintext> ga(nmacs, PTD);
   for (unsigned int i= 0; i < nmacs; i++)
     {
-      mul(tmp, c, industry.ct_mac(i), pk);
+      mul(tmp, ca, industry.ct_mac(i), pk);
       Reshare(ga[i], tmp, P, sk);
     }
 
-  // Construct the actual shares
+  // Create the shares
   unsigned int sz= pk.get_params().phi_m();
   vector<Share> alist(sz);
-  vector<gfp> openedlist(sz);
   vector<gfp> s(1), macs(nmacs);
   Share ss;
   for (unsigned int i= 0; i < sz; i++)
     {
-      get_share(s, macs, va, ga, i);
-      a.emplace_back(P.whoami(), s, macs);
+      get_share(s, macs, aa, ga, i);
+      ss.assign(P.whoami(), s, macs);
+      a.push_back(ss);
       alist[i]= ss;
     }
 
@@ -354,7 +439,7 @@ void offline_FHE_IO(Player &P, unsigned int player_num, list<Share> &a,
     {
       for (unsigned int i= 0; i < sz; i++)
         {
-          opened.push_back(m.element(i));
+          opened.push_back(va.element(i));
         }
     }
 }
diff --git a/src/Offline/offline_FHE.h b/src/Offline/offline_FHE.h
index 5224b7c3..40603ec4 100644
--- a/src/Offline/offline_FHE.h
+++ b/src/Offline/offline_FHE.h
@@ -24,7 +24,8 @@ void offline_FHE_triples(Player &P, list<Share> &a, list<Share> &b,
 void offline_FHE_squares(Player &P, list<Share> &a, list<Share> &b,
                          const FHE_PK &pk, const FHE_SK &sk,
                          const FFT_Data &PTD,
-                         FHE_Industry &industry);
+                         FHE_Industry &industry,
+                         unsigned int rep= 1);
 
 void offline_FHE_bits(Player &P, list<Share> &a, const FHE_PK &pk,
                       const FHE_SK &sk, const FFT_Data &PTD,
@@ -32,7 +33,8 @@ void offline_FHE_bits(Player &P, list<Share> &a, const FHE_PK &pk,
 
 void offline_FHE_IO(Player &P, unsigned int player_num, list<Share> &a,
                     list<gfp> &opened, const FHE_PK &pk, const FHE_SK &sk,
-                    const FFT_Data &PTD,
+                    const FFT_Data &PTD, offline_control_data &OCD,
+                    unsigned int online_thread,
                     FHE_Industry &industry);
 
 #endif
diff --git a/src/Offline/offline_IO_production.cpp b/src/Offline/offline_IO_production.cpp
index 41d63dd2..8f098d82 100644
--- a/src/Offline/offline_IO_production.cpp
+++ b/src/Offline/offline_IO_production.cpp
@@ -108,7 +108,8 @@ void make_IO_data_fake(Player &P, unsigned int player_num, list<Share> &a,
 void make_IO_data(Player &P, int fake_sacrifice, PRSS &prss,
                   unsigned int player_num, list<Share> &a, list<gfp> &opened,
                   const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
-                  Open_Protocol &OP,
+                  offline_control_data &OCD, Open_Protocol &OP,
+                  unsigned int online_thread,
                   FHE_Industry &industry)
 {
   a.resize(0);
@@ -116,7 +117,7 @@ void make_IO_data(Player &P, int fake_sacrifice, PRSS &prss,
 
   if (Share::SD.type == Full && !fake_sacrifice)
     {
-      offline_FHE_IO(P, player_num, a, opened, pk, sk, PTD, industry);
+      offline_FHE_IO(P, player_num, a, opened, pk, sk, PTD, OCD, online_thread, industry);
     }
   else if (fake_sacrifice)
     {
diff --git a/src/Offline/offline_IO_production.h b/src/Offline/offline_IO_production.h
index 187571a1..f8b9ed21 100644
--- a/src/Offline/offline_IO_production.h
+++ b/src/Offline/offline_IO_production.h
@@ -13,10 +13,12 @@ using namespace std;
 #include "FHE_Factory.h"
 #include "LSSS/Open_Protocol.h"
 #include "LSSS/PRSS.h"
+#include "offline_data.h"
 
 void make_IO_data(Player &P, int fake_sacrifice, PRSS &prss,
                   unsigned int player_num, list<Share> &a, list<gfp> &opened,
                   const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
-                  Open_Protocol &OP, FHE_Industry &industry);
+                  offline_control_data &OCD, Open_Protocol &OP,
+                  unsigned int num_online, FHE_Industry &industry);
 
 #endif
diff --git a/src/Offline/offline_Maurer.cpp b/src/Offline/offline_Maurer.cpp
index 5a675f1f..81cc8d60 100644
--- a/src/Offline/offline_Maurer.cpp
+++ b/src/Offline/offline_Maurer.cpp
@@ -8,6 +8,12 @@ All rights reserved
 #include "config.h"
 #include "offline_subroutines.h"
 
+void clear_vector_sstream(vector<stringstream> &ss)
+{
+  for (unsigned int i=0; i<ss.size(); i++)
+    { ss[i]=stringstream(); }
+}
+
 void mult_inner_subroutine_one(const Share &aa, const Share &bb,
                                vector<Share> &cc, vector<stringstream> &ss,
                                Player &P)
@@ -50,10 +56,11 @@ void offline_Maurer_triples(Player &P, PRSS &prss, list<Share> &a,
 {
   Share aa, bb;
   vector<vector<Share>> cc(amortize, vector<Share>(P.nplayers()));
+  vector<stringstream> ss(P.nplayers()); 
   for (int i= 0; i < sz_offline_batch / amortize; i++)
     {
-      vector<stringstream> ss(P.nplayers()); // This line is here to make sure ss
-                                             // is reinitialized every loop
+      clear_vector_sstream(ss);
+
       for (int j= 0; j < amortize; j++)
         {
           aa= prss.next_share(P);
@@ -81,16 +88,18 @@ void offline_Maurer_triples(Player &P, PRSS &prss, list<Share> &a,
     }
 }
 
-void offline_Maurer_squares(Player &P, PRSS &prss, list<Share> &a,
-                            list<Share> &b)
+void offline_Maurer_squares(Player &P, PRSS &prss,
+                            list<Share> &a, list<Share> &b,
+                            unsigned int rep)
 {
   Share aa;
   vector<vector<Share>> bb(amortize, vector<Share>(P.nplayers()));
   vector<string> sstr(P.nplayers());
-  for (int i= 0; i < sz_offline_batch / amortize; i++)
+  vector<stringstream> ss(P.nplayers()); 
+  while (a.size() < sz_offline_batch * rep)
     {
-      vector<stringstream> ss(P.nplayers()); // This line is here to make sure ss
-                                             // is reinitialized every loop
+      clear_vector_sstream(ss);
+
       for (int j= 0; j < amortize; j++)
         {
           aa= prss.next_share(P);
@@ -125,12 +134,15 @@ void offline_Maurer_bits(Player &P, PRSS &prss, list<Share> &b,
   gfp prod, one(1), twoi(2);
   twoi.invert();
   vector<string> sstr(P.nplayers());
+  vector<stringstream> ss(P.nplayers());
   for (int i= 0; i < sz_offline_batch / amortize; i++)
     {
       /* Essentially run the square protocol to get amortize
        * number of sharing of a and sharing of b=a^2
        */
-      vector<stringstream> ss(P.nplayers());
+
+      clear_vector_sstream(ss);
+
       for (int j= 0; j < amortize; j++)
         {
           aa[j]= prss.next_share(P);
diff --git a/src/Offline/offline_Maurer.h b/src/Offline/offline_Maurer.h
index bbf76068..a8f13906 100644
--- a/src/Offline/offline_Maurer.h
+++ b/src/Offline/offline_Maurer.h
@@ -17,7 +17,8 @@ void offline_Maurer_triples(Player &P, PRSS &prss, list<Share> &a,
                             list<Share> &b, list<Share> &c);
 
 void offline_Maurer_squares(Player &P, PRSS &prss, list<Share> &a,
-                            list<Share> &b);
+                            list<Share> &b,
+                            unsigned int rep= 1);
 
 void offline_Maurer_bits(Player &P, PRSS &prss, list<Share> &b,
                          Open_Protocol &OP);
diff --git a/src/Offline/offline_Reduced.cpp b/src/Offline/offline_Reduced.cpp
index 5b97fdf0..08617646 100644
--- a/src/Offline/offline_Reduced.cpp
+++ b/src/Offline/offline_Reduced.cpp
@@ -231,8 +231,9 @@ void offline_Reduced_triples(Player &P, PRSS &prss, PRZS &przs, list<Share> &a,
     }
 }
 
-void offline_Reduced_squares(Player &P, PRSS &prss, PRZS &przs, list<Share> &a,
-                             list<Share> &b)
+void offline_Reduced_squares(Player &P, PRSS &prss, PRZS &przs,
+                             list<Share> &a, list<Share> &b,
+                             unsigned int rep)
 {
   Share aa;
   vector<Share> cc(amortize), cc_m(amortize);
@@ -241,7 +242,7 @@ void offline_Reduced_squares(Player &P, PRSS &prss, PRZS &przs, list<Share> &a,
       Init_Configure(P.whoami());
     }
 
-  for (int i= 0; i < sz_offline_batch / amortize; i++)
+  while (a.size() < sz_offline_batch * rep)
     {
       vector<stringstream> ss_m(P.nplayers()); // This line is here to make sure
                                                // ss is reinitialized every loop
diff --git a/src/Offline/offline_Reduced.h b/src/Offline/offline_Reduced.h
index 09970cc9..6c9a9c6c 100644
--- a/src/Offline/offline_Reduced.h
+++ b/src/Offline/offline_Reduced.h
@@ -17,8 +17,9 @@ using namespace std;
 void offline_Reduced_triples(Player &P, PRSS &prss, PRZS &przs, list<Share> &a,
                              list<Share> &b, list<Share> &c);
 
-void offline_Reduced_squares(Player &P, PRSS &prss, PRZS &przs, list<Share> &a,
-                             list<Share> &b);
+void offline_Reduced_squares(Player &P, PRSS &prss, PRZS &przs,
+                             list<Share> &a, list<Share> &b,
+                             unsigned int rep= 1);
 
 void offline_Reduced_bits(Player &P, PRSS &prss, PRZS &przs, list<Share> &b,
                           Open_Protocol &OP);
diff --git a/src/Offline/offline_data.cpp b/src/Offline/offline_data.cpp
index 62fe4384..8118531d 100644
--- a/src/Offline/offline_data.cpp
+++ b/src/Offline/offline_data.cpp
@@ -25,25 +25,42 @@ void Wait_For_Preproc(int type, unsigned int size, int thread,
   bool wait= true;
   while (wait)
     {
-      OCD.sacrifice_mutex[thread].lock();
       wait= false;
-      if (type == DATA_TRIPLE && SacrificeD[thread].TD.ta.size() < size)
+      switch (type)
         {
-          wait= true;
+          case DATA_TRIPLE:
+            OCD.mul_mutex[thread].lock();
+            if (SacrificeD[thread].TD.ta.size() < size)
+              {
+                wait= true;
+              }
+            OCD.mul_mutex[thread].unlock();
+            break;
+          case DATA_SQUARE:
+            OCD.sqr_mutex[thread].lock();
+            if (SacrificeD[thread].SD.sa.size() < size)
+              {
+                wait= true;
+              }
+            OCD.sqr_mutex[thread].unlock();
+            break;
+          case DATA_BIT:
+            OCD.bit_mutex[thread].lock();
+            if (SacrificeD[thread].BD.bb.size() < size)
+              {
+                wait= true;
+              }
+            OCD.bit_mutex[thread].unlock();
+            break;
+          case DATA_INPUT_MASK:
+            OCD.OCD_mutex[thread].lock();
+            if (SacrificeD[thread].ID.ios[player].size() < size)
+              {
+                wait= true;
+              }
+            OCD.OCD_mutex[thread].unlock();
+            break;
         }
-      if (type == DATA_SQUARE && SacrificeD[thread].SD.sa.size() < size)
-        {
-          wait= true;
-        }
-      if (type == DATA_BIT && SacrificeD[thread].BD.bb.size() < size)
-        {
-          wait= true;
-        }
-      if (type == DATA_INPUT_MASK && SacrificeD[thread].ID.ios[player].size() < size)
-        {
-          wait= true;
-        }
-      OCD.sacrifice_mutex[thread].unlock();
       if (wait)
         {
           sleep(1);
diff --git a/src/Offline/offline_data.h b/src/Offline/offline_data.h
index 5f328aac..ebcbb9da 100644
--- a/src/Offline/offline_data.h
+++ b/src/Offline/offline_data.h
@@ -11,6 +11,7 @@ All rights reserved
 #include <mutex>
 using namespace std;
 
+#include "FHE/ZKPoK.h"
 #include "LSSS/Share.h"
 
 /* These datatypes are just the data store for the offline phase
@@ -82,21 +83,21 @@ class offline_control_data
 {
   void clean_up()
   {
-    if (mult_mutex != NULL)
+    if (OCD_mutex != NULL)
       {
-        delete[] mult_mutex;
-        delete[] square_mutex;
+        delete[] OCD_mutex;
+        delete[] mul_mutex;
+        delete[] sqr_mutex;
         delete[] bit_mutex;
-        delete[] sacrifice_mutex;
-        mult_mutex= NULL;
+        OCD_mutex= NULL;
       }
   }
 
 public:
-  mutex *mult_mutex;      // Lock for mult threads
-  mutex *square_mutex;    // Lock for square threads
-  mutex *bit_mutex;       // Lock for bit threads
-  mutex *sacrifice_mutex; // Lock for sacrifice threads
+  mutex *OCD_mutex; // Lock for control OCD and input data
+  mutex *mul_mutex; // Lock for mult triples
+  mutex *sqr_mutex; // Lock for sqr pairs
+  mutex *bit_mutex; // Lock for bits
 
   // Min number produced before we start online
   unsigned int minm, mins, minb;
@@ -106,20 +107,27 @@ class offline_control_data
   vector<unsigned int> totm, tots, totb, totI;
 
   vector<int> finish_offline;   // Flag to say whether we SHOULD finish offline
+  vector<int> finished_online;  // Flag to say whether online has finished
   vector<int> finished_offline; // Counts how many threads HAVE died
 
-  void resize(unsigned int num_threads)
+  // The ZKPoKs needed for IO production when doing Full Threshold
+  vector<vector<ZKPoK>> IO_ZKPoKs;
+
+  void resize(unsigned int num_threads,
+              unsigned int nplayers,
+              unsigned int whoami)
   {
     clean_up();
-    mult_mutex= new mutex[num_threads];
-    square_mutex= new mutex[num_threads];
+    OCD_mutex= new mutex[num_threads];
+    mul_mutex= new mutex[num_threads];
+    sqr_mutex= new mutex[num_threads];
     bit_mutex= new mutex[num_threads];
-    sacrifice_mutex= new mutex[num_threads];
     totm.resize(num_threads);
     tots.resize(num_threads);
     totb.resize(num_threads);
     totI.resize(num_threads);
     finish_offline.resize(num_threads);
+    finished_online.resize(num_threads);
     finished_offline.resize(num_threads);
     for (unsigned int i= 0; i < num_threads; i++)
       {
@@ -129,12 +137,37 @@ class offline_control_data
         totI[i]= 0;
         finish_offline[i]= 0;
         finished_offline[i]= 0;
+        finished_online[i]= 0;
+      }
+    if (Share::SD.type == Full)
+      {
+        IO_ZKPoKs.resize(num_threads);
+        for (unsigned int i= 0; i < num_threads; i++)
+          {
+            IO_ZKPoKs[i].resize(nplayers);
+            for (unsigned int j= 0; j < nplayers; j++)
+              {
+                if (j == whoami)
+                  {
+                    IO_ZKPoKs[i][j].set_params(true, true);
+                  }
+                else
+                  {
+                    IO_ZKPoKs[i][j].set_params(true, false);
+                  }
+              }
+          }
       }
   }
 
+  unsigned int num_online_threads() const
+  {
+    return totm.size();
+  }
+
   offline_control_data()
   {
-    mult_mutex= NULL;
+    OCD_mutex= NULL;
   }
   ~offline_control_data()
   {
diff --git a/src/Offline/offline_phases.cpp b/src/Offline/offline_phases.cpp
index 5b9322e0..ae341682 100644
--- a/src/Offline/offline_phases.cpp
+++ b/src/Offline/offline_phases.cpp
@@ -21,9 +21,6 @@ using namespace std;
 
 extern Timer global_time;
 
-extern vector<triples_data> TriplesD;
-extern vector<squares_data> SquaresD;
-extern vector<bits_data> BitsD;
 extern vector<sacrificed_data> SacrificeD;
 
 enum ODtype {
@@ -39,49 +36,50 @@ enum ODtype {
 //    0 = OK, prepare some more stuff
 //    1 = Exit
 //    2 = Wait
+// Do not apply locking as we are only reading, this should be OK
 int check_exit(int num_online, const Player &P, offline_control_data &OCD, ODtype T)
 {
   int result= 0;
   string ss= "-";
   if (P.whoami() == 0)
     {
-      OCD.sacrifice_mutex[num_online].lock();
+      //OCD.OCD_mutex[num_online].lock();
       if (OCD.finish_offline[num_online] == 1)
         {
           ss= "E";
           result= 1;
         }
-      OCD.sacrifice_mutex[num_online].unlock();
+      //OCD.OCD_mutex[num_online].unlock();
       if (result == 0)
         { /* wait if the queues are too big */
           switch (T)
             {
               case Triples:
-                OCD.mult_mutex[num_online].lock();
-                if ((TriplesD[num_online].ta.size() > max_triples_offline) || (OCD.totm[num_online] > OCD.maxm && OCD.maxm != 0))
+                //OCD.mul_mutex[num_online].lock();
+                if ((SacrificeD[num_online].TD.ta.size() > max_triples_sacrifice) || (OCD.totm[num_online] > OCD.maxm && OCD.maxm != 0))
                   {
                     result= 2;
                     ss= "W";
                   }
-                OCD.mult_mutex[num_online].unlock();
+                //OCD.mul_mutex[num_online].unlock();
                 break;
               case Squares:
-                OCD.square_mutex[num_online].lock();
-                if ((SquaresD[num_online].sa.size() > max_squares_offline) || (OCD.tots[num_online] > OCD.maxs && OCD.maxs != 0 && OCD.totb[num_online] > OCD.maxb && OCD.maxb != 0))
+                //OCD.sqr_mutex[num_online].lock();
+                if ((SacrificeD[num_online].SD.sa.size() > max_squares_sacrifice) || (OCD.tots[num_online] > OCD.maxs && OCD.maxs != 0))
                   {
                     result= 2;
                     ss= "W";
                   }
-                OCD.square_mutex[num_online].unlock();
+                //OCD.sqr_mutex[num_online].unlock();
                 break;
               case Bits:
-                OCD.bit_mutex[num_online].lock();
-                if ((BitsD[num_online].bb.size() > max_bits_offline) || (OCD.totb[num_online] > OCD.maxb && OCD.maxb != 0))
+                //OCD.bit_mutex[num_online].lock();
+                if ((SacrificeD[num_online].BD.bb.size() > max_bits_sacrifice) || (OCD.totb[num_online] > OCD.maxb && OCD.maxb != 0))
                   {
                     result= 2;
                     ss= "W";
                   }
-                OCD.bit_mutex[num_online].unlock();
+                //OCD.bit_mutex[num_online].unlock();
                 break;
               default:
                 throw bad_value();
@@ -104,8 +102,9 @@ int check_exit(int num_online, const Player &P, offline_control_data &OCD, ODtyp
   return result;
 }
 
-void mult_phase(int num_online, Player &P, offline_control_data &OCD,
-                const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
+void mult_phase(int num_online, Player &P, int fake_sacrifice,
+                offline_control_data &OCD, const FHE_PK &pk,
+                const FHE_SK &sk, const FFT_Data &PTD,
                 FHE_Industry &industry,
                 int verbose)
 {
@@ -114,26 +113,27 @@ void mult_phase(int num_online, Player &P, offline_control_data &OCD,
   PRZS przs(P);
   FakePrep prep(P);
 
+  Open_Protocol OP;
+
   list<Share> a, b, c;
   list<Share>::iterator it;
   int flag;
   while (0 == 0)
     {
       flag= check_exit(num_online, P, OCD, Triples);
-
       /* Needs to die gracefully if online is gone */
       if (flag == 1)
         {
           printf("Exiting mult phase : thread = %d\n", num_online);
-          OCD.sacrifice_mutex[num_online].lock();
+          OCD.OCD_mutex[num_online].lock();
           OCD.finished_offline[num_online]++;
-          OCD.sacrifice_mutex[num_online].unlock();
+          OCD.OCD_mutex[num_online].unlock();
           return;
         }
 
       if (flag == 2)
         {
-          sleep(5);
+          sleep(1);
         }
       else
         {
@@ -142,7 +142,8 @@ void mult_phase(int num_online, Player &P, offline_control_data &OCD,
               printf("In triples: thread = %d\n", num_online);
               fflush(stdout);
             }
-          offline_phase_triples(P, prss, przs, prep, a, b, c, pk, sk, PTD, industry);
+
+          offline_phase_triples(P, prss, przs, prep, a, b, c, pk, sk, PTD, fake_sacrifice, OP, industry);
           if (verbose > 1)
             {
               printf("Out of triples: thread = %d\n", num_online);
@@ -150,19 +151,21 @@ void mult_phase(int num_online, Player &P, offline_control_data &OCD,
             }
 
           /* Add to queues */
-          OCD.mult_mutex[num_online].lock();
-          it= TriplesD[num_online].ta.end();
-          TriplesD[num_online].ta.splice(it, a);
-          it= TriplesD[num_online].tb.end();
-          TriplesD[num_online].tb.splice(it, b);
-          it= TriplesD[num_online].tc.end();
-          TriplesD[num_online].tc.splice(it, c);
-          OCD.mult_mutex[num_online].unlock();
+          OCD.mul_mutex[num_online].lock();
+          OCD.totm[num_online]+= a.size();
+          it= SacrificeD[num_online].TD.ta.end();
+          SacrificeD[num_online].TD.ta.splice(it, a);
+          it= SacrificeD[num_online].TD.tb.end();
+          SacrificeD[num_online].TD.tb.splice(it, b);
+          it= SacrificeD[num_online].TD.tc.end();
+          SacrificeD[num_online].TD.tc.splice(it, c);
+          OCD.mul_mutex[num_online].unlock();
         }
     }
 }
 
-void square_phase(int num_online, Player &P, offline_control_data &OCD,
+void square_phase(int num_online, Player &P, int fake_sacrifice,
+                  offline_control_data &OCD,
                   const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
                   FHE_Industry &industry,
                   int verbose)
@@ -172,6 +175,8 @@ void square_phase(int num_online, Player &P, offline_control_data &OCD,
   PRZS przs(P);
   FakePrep prep(P);
 
+  Open_Protocol OP;
+
   list<Share> a, b, c;
   list<Share>::iterator it;
   int flag;
@@ -183,15 +188,15 @@ void square_phase(int num_online, Player &P, offline_control_data &OCD,
       if (flag == 1)
         {
           printf("Exiting square phase: thread = %d\n", num_online);
-          OCD.sacrifice_mutex[num_online].lock();
+          OCD.OCD_mutex[num_online].lock();
           OCD.finished_offline[num_online]++;
-          OCD.sacrifice_mutex[num_online].unlock();
+          OCD.OCD_mutex[num_online].unlock();
           return;
         }
 
       if (flag == 2)
         {
-          sleep(5);
+          sleep(1);
         }
       else
         {
@@ -200,7 +205,8 @@ void square_phase(int num_online, Player &P, offline_control_data &OCD,
               printf("In squares: thread = %d\n", num_online);
               fflush(stdout);
             }
-          offline_phase_squares(P, prss, przs, prep, a, b, pk, sk, PTD, industry);
+
+          offline_phase_squares(P, prss, przs, prep, a, b, pk, sk, PTD, fake_sacrifice, OP, industry);
           if (verbose > 1)
             {
               printf("Out of squares: thread = %d\n", num_online);
@@ -208,17 +214,19 @@ void square_phase(int num_online, Player &P, offline_control_data &OCD,
             }
 
           /* Add to queues */
-          OCD.square_mutex[num_online].lock();
-          it= SquaresD[num_online].sa.end();
-          SquaresD[num_online].sa.splice(it, a);
-          it= SquaresD[num_online].sb.end();
-          SquaresD[num_online].sb.splice(it, b);
-          OCD.square_mutex[num_online].unlock();
+          OCD.sqr_mutex[num_online].lock();
+          OCD.tots[num_online]+= a.size();
+          it= SacrificeD[num_online].SD.sa.end();
+          SacrificeD[num_online].SD.sa.splice(it, a);
+          it= SacrificeD[num_online].SD.sb.end();
+          SacrificeD[num_online].SD.sb.splice(it, b);
+          OCD.sqr_mutex[num_online].unlock();
         }
     }
 }
 
-void bit_phase(int num_online, Player &P, offline_control_data &OCD,
+void bit_phase(int num_online, Player &P, int fake_sacrifice,
+               offline_control_data &OCD,
                const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
                FHE_Industry &industry,
                int verbose)
@@ -241,15 +249,15 @@ void bit_phase(int num_online, Player &P, offline_control_data &OCD,
       if (flag == 1)
         {
           printf("Exiting bit phase: thread = %d\n", num_online);
-          OCD.sacrifice_mutex[num_online].lock();
+          OCD.OCD_mutex[num_online].lock();
           OCD.finished_offline[num_online]++;
-          OCD.sacrifice_mutex[num_online].unlock();
+          OCD.OCD_mutex[num_online].unlock();
           return;
         }
 
       if (flag == 2)
         {
-          sleep(5);
+          sleep(1);
         }
       else
         {
@@ -258,7 +266,9 @@ void bit_phase(int num_online, Player &P, offline_control_data &OCD,
               printf("In bits: thread = %d\n", num_online);
               fflush(stdout);
             }
-          offline_phase_bits(P, prss, przs, prep, b, OP, pk, sk, PTD, industry);
+
+          offline_phase_bits(P, prss, przs, prep, b, pk, sk, PTD, fake_sacrifice, OP, industry);
+
           if (verbose > 1)
             {
               printf("Out of bits: thread = %d\n", num_online);
@@ -267,385 +277,156 @@ void bit_phase(int num_online, Player &P, offline_control_data &OCD,
 
           /* Add to queues */
           OCD.bit_mutex[num_online].lock();
-          it= BitsD[num_online].bb.end();
-          BitsD[num_online].bb.splice(it, b);
+          OCD.totb[num_online]+= b.size();
+          it= SacrificeD[num_online].BD.bb.end();
+          SacrificeD[num_online].BD.bb.splice(it, b);
           OCD.bit_mutex[num_online].unlock();
         }
     }
 }
 
-/* This proposes a number of things to sacrifice to get
- * agreement amongst all players. All players propose
- * and then they take the minumum
+/* Decide what to do for the inputs thread
+ *   Make inputs or exit or wait
+ * All players propose and then they take the minimum
+ * No thread locks as we are only reading
  */
-bool propose_numbers_sacrifice(int num_online, Player &P, int &nm, int &ns,
-                               int &nb, vector<int> &make_inputs,
-                               offline_control_data &OCD,
-                               int verbose)
+bool propose_what_to_do(int num_online, Player &P, int &finish,
+                        vector<int> &make_inputs, bool &minput_global,
+                        offline_control_data &OCD,
+                        int verbose)
 {
   // The number of sacrifice equations we need per item produced
   int rep= sacrifice_stat_sec / numBits(gfp::pr()) + 1;
 
   // Each player first either proposes a set of numbers or an exit
-  nm= 0;
-  ns= 0;
-  nb= 0;
-  bool minputs= false;
+  finish= 0;
+  minput_global= false;
   for (unsigned int i= 0; i < P.nplayers(); i++)
     {
       make_inputs[i]= 0;
     }
 
-  while (nm == 0 && ns == 0 && nb == 0 && minputs == false)
+  //OCD.OCD_mutex[num_online].lock();
+  for (unsigned int i= 0; i < P.nplayers(); i++)
     {
-      OCD.mult_mutex[num_online].lock();
-      int ta= TriplesD[num_online].ta.size();
-      OCD.mult_mutex[num_online].unlock();
-
-      OCD.square_mutex[num_online].lock();
-      int sa= SquaresD[num_online].sa.size();
-      OCD.square_mutex[num_online].unlock();
-
-      OCD.bit_mutex[num_online].lock();
-      int bb= BitsD[num_online].bb.size();
-      OCD.bit_mutex[num_online].unlock();
-
-      nm= min((rep + 1) * sz_triples_sacrifice, ta) - 1;
-      nm= (nm / (rep + 1)) * (rep + 1); // Make a round mult of (rep+1)
-      if (nm < 0)
-        {
-          nm= 0;
-        }
-
-      nb= min(sz_bits_sacrifice, bb) - 1;
-      nb= min(nb, sa / rep) -
-          10; // Leave some gap for making squares
-      if (nb < 0)
-        {
-          nb= 0;
-        }
-
-      ns= min((rep + 1) * sz_squares_sacrifice, sa - rep * nb) - 1;
-      ns= (ns / (rep + 1)) * (rep + 1); // Make a round mult of (rep+1)
-      if (ns < 0)
-        {
-          ns= 0;
-        }
-
-      if (verbose > 1)
-        {
-          printf("In sacrifice proposal: thread = %d : %d %d %d\n", num_online, ta, sa, bb);
-          fflush(stdout);
-        }
-
-      OCD.sacrifice_mutex[num_online].lock();
-      if (SacrificeD[num_online].TD.ta.size() > max_triples_sacrifice)
-        {
-          nm= 0;
-        }
-      if (OCD.totm[num_online] > OCD.maxm && OCD.maxm != 0)
-        {
-          nm= 0;
-        }
-      if (SacrificeD[num_online].SD.sa.size() > max_squares_sacrifice)
-        {
-          ns= 0;
-        }
-      if (OCD.tots[num_online] > OCD.maxs && OCD.maxs != 0)
-        {
-          ns= 0;
-        }
-      if (SacrificeD[num_online].BD.bb.size() > max_bits_sacrifice)
-        {
-          nb= 0;
-        }
-      if (OCD.totb[num_online] > OCD.maxb && OCD.maxb != 0)
-        {
-          nb= 0;
-        }
-      for (unsigned int i= 0; i < P.nplayers(); i++)
+      if ((OCD.maxI == 0 && SacrificeD[num_online].ID.ios[i].size() < max_IO_sacrifice) || (OCD.totI[num_online] < OCD.maxI))
         {
-          if ((OCD.maxI == 0 && SacrificeD[num_online].ID.ios[i].size() < max_IO_sacrifice) || (OCD.totI[num_online] < OCD.maxI))
-            {
-              make_inputs[i]= 1;
-              minputs= true;
-            }
+          make_inputs[i]= 1;
+          minput_global= true;
         }
+    }
 
-      /* Needs to die gracefully if we are told to */
-      // If I am exiting tell other players
-      if (OCD.finish_offline[num_online] == 1)
-        {
-          nm= -1;
-        }
-      OCD.sacrifice_mutex[num_online].unlock();
+  /* Needs to die gracefully if we are told to */
+  // If I am exiting tell other players
+  if (OCD.finish_offline[num_online] == 1)
+    {
+      finish= -1;
+    }
+  //OCD.OCD_mutex[num_online].unlock();
 
-      // Propose to other players what I have
-      stringstream ss;
-      ss << nm << " " << ns << " " << nb << " ";
-      for (unsigned int i= 0; i < P.nplayers(); i++)
-        {
-          ss << make_inputs[i] << " ";
-        }
-      if (verbose > 1)
-        {
-          printf("Proposing Sacrifice : thread = %d : %s rep=%d\n", num_online,
-                 ss.str().c_str(), rep);
-          fflush(stdout);
-        }
-      P.send_all(ss.str());
+  // Propose to other players what I have
+  stringstream ss;
+  ss << finish << " ";
+  for (unsigned int i= 0; i < P.nplayers(); i++)
+    {
+      ss << make_inputs[i] << " ";
+    }
+  if (verbose > 1)
+    {
+      printf("Proposing Sacrifice : thread = %d : %s rep=%d\n", num_online,
+             ss.str().c_str(), rep);
+      fflush(stdout);
+    }
+  P.send_all(ss.str());
 
-      // Now get data from all players taking minimum/max where necessary
-      for (unsigned int i= 0; i < P.nplayers(); i++)
-        {
-          if (i != P.whoami())
+  // Now get data from all players taking minimum/max where necessary
+  for (unsigned int i= 0; i < P.nplayers(); i++)
+    {
+      if (i != P.whoami())
+        {
+          string ss;
+          P.receive_from_player(i, ss);
+          istringstream is(ss);
+          int fint, ni;
+          is >> fint;
+          finish= min(finish, fint);
+          for (unsigned int i= 0; i < P.nplayers(); i++)
             {
-              string ss;
-              P.receive_from_player(i, ss);
-              istringstream is(ss);
-              int nmt, nst, nbt, ni;
-              is >> nmt >> nst >> nbt;
-              nm= min(nm, nmt);
-              ns= min(ns, nst);
-              nb= min(nb, nbt);
-              for (unsigned int i= 0; i < P.nplayers(); i++)
+              is >> ni;
+              make_inputs[i]= max(make_inputs[i], ni);
+              if (make_inputs[i] == 1)
                 {
-                  is >> ni;
-                  make_inputs[i]= max(make_inputs[i], ni);
-                  if (make_inputs[i] == 1)
-                    {
-                      minputs= true;
-                    }
+                  minput_global= true;
                 }
             }
         }
-      if (nm == 0 && ns == 0 && nb == 0 && minputs == false)
-        {
-          sleep(1);
-        }
     }
 
   bool exit= false;
   // Signal exit if anyone signalled exit
-  if (nm == -1)
+  if (finish == -1)
     {
       exit= true;
     }
   return exit;
 }
 
-void sacrifice_phase(int num_online, Player &P, int fake_sacrifice,
-                     offline_control_data &OCD, const FHE_PK &pk,
-                     const FHE_SK &sk, const FFT_Data &PTD,
-                     FHE_Industry &industry,
-                     int verbose)
+
+/* Thread locks removed when only reading */
+void inputs_phase(int num_online, Player &P, int fake_sacrifice,
+                  offline_control_data &OCD, const FHE_PK &pk,
+                  const FHE_SK &sk, const FFT_Data &PTD,
+                  FHE_Industry &industry,
+                  int verbose)
 {
-  int nm, ns, nb;
-  int rep= sacrifice_stat_sec / numBits(gfp::pr()) + 1;
+  int finish;
 
   // Initialize PRSS stuff for IO production
-  //   - We do IO production here as it is rarely done, and hence this
-  //     keep the sacrifice phase thread busy whilst other threads
-  //     do the main work
   PRSS prss(P);
 
   Open_Protocol OP;
 
-  list<Share> a, b, c;
+  list<Share> a;
   list<gfp> opened;
-  list<Share>::iterator first, last, it;
+  list<Share>::iterator it;
   list<gfp>::iterator it_g;
   vector<int> minputs(P.nplayers());
-  bool exit= false, ready= false;
+  bool minput_global, exit= false;
   while (0 == 0)
     {
-      exit= propose_numbers_sacrifice(num_online, P, nm, ns, nb, minputs, OCD,
-                                      verbose - 1);
+      exit= propose_what_to_do(num_online, P, finish, minputs, minput_global, OCD,
+                               verbose - 1);
 
-      // Do the input bits first as the other operations wait until
-      // enough data is ready to sacrifice
-      for (unsigned int i= 0; i < P.nplayers() && !exit; i++)
+      if (exit != true && finish == 0 && minput_global == false)
         {
-          if (minputs[i])
-            {
-              make_IO_data(P, fake_sacrifice, prss, i, a, opened, pk, sk, PTD, OP, industry);
-
-              /* Add to queues */
-              OCD.sacrifice_mutex[num_online].lock();
-              OCD.totI[num_online]+= a.size();
-              it= SacrificeD[num_online].ID.ios[i].end();
-              SacrificeD[num_online].ID.ios[i].splice(it, a);
-              if (i == P.whoami())
-                {
-                  it_g= SacrificeD[num_online].ID.opened_ios.end();
-                  SacrificeD[num_online].ID.opened_ios.splice(it_g, opened);
-                }
-              OCD.sacrifice_mutex[num_online].unlock();
-            }
+          sleep(2);
         }
-
-      // Wait until we have enough offline data in this thread to be able to do
-      // the required sacrifice
-      if (!exit)
+      else
         {
-          ready= false;
-          while (ready == false && !exit)
+          for (unsigned int i= 0; i < P.nplayers() && !exit; i++)
             {
-              ready= true;
-              OCD.mult_mutex[num_online].lock();
-              if ((int) TriplesD[num_online].ta.size() < nm)
-                {
-                  ready= false;
-                }
-              OCD.mult_mutex[num_online].unlock();
-
-              OCD.square_mutex[num_online].lock();
-              if ((int) SquaresD[num_online].sa.size() < rep * nb + ns)
-                {
-                  ready= false;
-                }
-              OCD.square_mutex[num_online].unlock();
-
-              OCD.bit_mutex[num_online].lock();
-              if ((int) BitsD[num_online].bb.size() < nb)
-                {
-                  ready= false;
-                }
-              OCD.bit_mutex[num_online].unlock();
-
-              /* Wait if nothing to do */
-              if (ready == false)
+              if (minputs[i])
                 {
-                  sleep(1);
+                  make_IO_data(P, fake_sacrifice, prss, i, a, opened, pk, sk, PTD, OCD, OP, num_online, industry);
+
+                  /* Add to queues */
+                  OCD.OCD_mutex[num_online].lock();
+                  OCD.totI[num_online]+= a.size();
+                  it= SacrificeD[num_online].ID.ios[i].end();
+                  SacrificeD[num_online].ID.ios[i].splice(it, a);
+                  if (i == P.whoami())
+                    {
+                      it_g= SacrificeD[num_online].ID.opened_ios.end();
+                      SacrificeD[num_online].ID.opened_ios.splice(it_g, opened);
+                    }
+                  OCD.OCD_mutex[num_online].unlock();
                 }
             }
         }
-
-      /* Do the actual work we need to do */
-      if (nm > 0 && !exit)
-        {
-          if (verbose > 1)
-            {
-              printf("Sac: thread = %d : T: %d\n", num_online, nm);
-              fflush(stdout);
-            }
-
-          OCD.mult_mutex[num_online].lock();
-
-          first= TriplesD[num_online].ta.begin();
-          last= TriplesD[num_online].ta.begin();
-          advance(last, nm);
-          a.clear();
-          a.splice(a.begin(), TriplesD[num_online].ta, first, last);
-
-          first= TriplesD[num_online].tb.begin();
-          last= TriplesD[num_online].tb.begin();
-          advance(last, nm);
-          b.clear();
-          b.splice(b.begin(), TriplesD[num_online].tb, first, last);
-
-          first= TriplesD[num_online].tc.begin();
-          last= TriplesD[num_online].tc.begin();
-          advance(last, nm);
-          c.clear();
-          c.splice(c.begin(), TriplesD[num_online].tc, first, last);
-
-          OCD.mult_mutex[num_online].unlock();
-
-          sacrifice_phase_triples(P, fake_sacrifice, a, b, c, OP);
-
-          /* Add to queues */
-          OCD.sacrifice_mutex[num_online].lock();
-          OCD.totm[num_online]+= a.size();
-          it= SacrificeD[num_online].TD.ta.end();
-          SacrificeD[num_online].TD.ta.splice(it, a);
-          it= SacrificeD[num_online].TD.tb.end();
-          SacrificeD[num_online].TD.tb.splice(it, b);
-          it= SacrificeD[num_online].TD.tc.end();
-          SacrificeD[num_online].TD.tc.splice(it, c);
-          OCD.sacrifice_mutex[num_online].unlock();
-        }
-      if (ns > 0 && !exit)
-        {
-          if (verbose > 1)
-            {
-              printf("Sac: thread = %d : S: %d\n", num_online, ns);
-              fflush(stdout);
-            }
-
-          OCD.square_mutex[num_online].lock();
-
-          first= SquaresD[num_online].sa.begin();
-          last= SquaresD[num_online].sa.begin();
-          advance(last, ns);
-          a.clear();
-          a.splice(a.begin(), SquaresD[num_online].sa, first, last);
-
-          first= SquaresD[num_online].sb.begin();
-          last= SquaresD[num_online].sb.begin();
-          advance(last, ns);
-          b.clear();
-          b.splice(b.begin(), SquaresD[num_online].sb, first, last);
-
-          OCD.square_mutex[num_online].unlock();
-
-          sacrifice_phase_squares(P, fake_sacrifice, a, b, OP);
-
-          /* Add to queues */
-          OCD.sacrifice_mutex[num_online].lock();
-          OCD.tots[num_online]+= a.size();
-          it= SacrificeD[num_online].SD.sa.end();
-          SacrificeD[num_online].SD.sa.splice(it, a);
-          it= SacrificeD[num_online].SD.sb.end();
-          SacrificeD[num_online].SD.sb.splice(it, b);
-          OCD.sacrifice_mutex[num_online].unlock();
-        }
-      if (nb > 0 && !exit)
-        {
-          if (verbose > 1)
-            {
-              printf("Sac: thread = %d : B: %d\n", num_online, nb);
-              fflush(stdout);
-            }
-
-          OCD.bit_mutex[num_online].lock();
-
-          first= BitsD[num_online].bb.begin();
-          last= BitsD[num_online].bb.begin();
-          advance(last, nb);
-          a.clear();
-          a.splice(a.begin(), BitsD[num_online].bb, first, last);
-
-          OCD.bit_mutex[num_online].unlock();
-
-          OCD.square_mutex[num_online].lock();
-
-          first= SquaresD[num_online].sa.begin();
-          last= SquaresD[num_online].sa.begin();
-          advance(last, rep * nb);
-          b.clear();
-          b.splice(b.begin(), SquaresD[num_online].sa, first, last);
-
-          first= SquaresD[num_online].sb.begin();
-          last= SquaresD[num_online].sb.begin();
-          advance(last, rep * nb);
-          c.clear();
-          c.splice(c.begin(), SquaresD[num_online].sb, first, last);
-
-          OCD.square_mutex[num_online].unlock();
-
-          sacrifice_phase_bits(P, fake_sacrifice, a, b, c, OP);
-
-          /* Add to queues */
-          OCD.sacrifice_mutex[num_online].lock();
-          OCD.totb[num_online]+= a.size();
-          it= SacrificeD[num_online].BD.bb.end();
-          SacrificeD[num_online].BD.bb.splice(it, a);
-          OCD.sacrifice_mutex[num_online].unlock();
-        }
-      OCD.sacrifice_mutex[num_online].lock();
       if (verbose > 0)
-        { // Print this data it is useful for debugging stuff
+        { // Only reading for printing purposes, so we do not lock the mutexes
+
+          // Print this data; it is useful for debugging
           printf("Sacrifice Queues : thread = %d : %lu %lu %lu : ", num_online,
                  SacrificeD[num_online].TD.ta.size(),
                  SacrificeD[num_online].SD.sa.size(),
@@ -689,24 +470,55 @@ void sacrifice_phase(int num_online, Player &P, int fake_sacrifice,
           avg= time / btotal;
           printf("Seconds per `Thing` (all threads) %f : Total %f : Throughput %f\n", avg, total, 1 / avg);
         }
-      /* Check whether we should kill the offline phase as we have now enough data */
-      if (OCD.totm[num_online] > OCD.maxm && OCD.maxm != 0 &&
-          OCD.tots[num_online] > OCD.maxs && OCD.maxs != 0 &&
-          OCD.totb[num_online] > OCD.maxb && OCD.maxb != 0 &&
-          OCD.totI[num_online] > OCD.maxI)
-        {
-          OCD.finish_offline[num_online]= 1;
-          printf("We have enough data to stop offline phase now\n");
+      if (!exit && (OCD.maxm != 0 || OCD.maxs != 0 || OCD.maxb != 0 || OCD.maxI != 0))
+        { /* Check whether we should kill the offline phase as we have now enough data */
           exit= true;
+
+	  // OCD.mul_mutex[num_online].lock();
+          if (OCD.totm[num_online] < OCD.maxm || OCD.maxm == 0)
+            {
+              exit= false;
+            }
+          //OCD.mul_mutex[num_online].unlock();
+
+          //OCD.sqr_mutex[num_online].lock();
+          if (OCD.tots[num_online] < OCD.maxs || OCD.maxs == 0)
+            {
+              exit= false;
+            }
+          //OCD.sqr_mutex[num_online].unlock();
+
+          //OCD.bit_mutex[num_online].lock();
+          if (OCD.totb[num_online] < OCD.maxb || OCD.maxb == 0)
+            {
+              exit= false;
+            }
+          //OCD.bit_mutex[num_online].unlock();
+
+          //OCD.OCD_mutex[num_online].lock();
+          if (OCD.totI[num_online] < OCD.maxI || OCD.maxI == 0)
+            {
+              exit= false;
+            }
+          //OCD.OCD_mutex[num_online].unlock();
+
+          if (exit)
+            {
+              OCD.OCD_mutex[num_online].lock();
+              OCD.finish_offline[num_online]= 1;
+              OCD.OCD_mutex[num_online].unlock();
+              printf("We have enough data to stop offline phase now\n");
+            }
         }
       if (exit)
         {
+          OCD.OCD_mutex[num_online].lock();
           OCD.finished_offline[num_online]++;
+          OCD.OCD_mutex[num_online].unlock();
         }
-      OCD.sacrifice_mutex[num_online].unlock();
       if (exit)
         {
-          printf("Exiting sacrifice phase : thread = %d\n", num_online);
+          printf("Exiting inputs phase : thread = %d\n", num_online);
           return;
         }
     }
diff --git a/src/Offline/offline_phases.h b/src/Offline/offline_phases.h
index 6a06081e..ae662677 100644
--- a/src/Offline/offline_phases.h
+++ b/src/Offline/offline_phases.h
@@ -11,25 +11,31 @@ All rights reserved
 #include "System/Player.h"
 #include "offline_data.h"
 
-void mult_phase(int num_online, Player &P, offline_control_data &OCD,
+void mult_phase(int num_online, Player &P, int fake_sacrifice,
+                offline_control_data &OCD,
                 const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
                 FHE_Industry &industry,
                 int verbose);
 
-void square_phase(int num_online, Player &P, offline_control_data &OCD,
+void square_phase(int num_online, Player &P, int fake_sacrifice,
+                  offline_control_data &OCD,
                   const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
                   FHE_Industry &industry,
                   int verbose);
 
-void bit_phase(int num_online, Player &P, offline_control_data &OCD,
+void bit_phase(int num_online, Player &P, int fake_sacrifice,
+               offline_control_data &OCD,
                const FHE_PK &pk, const FHE_SK &sk, const FFT_Data &PTD,
                FHE_Industry &industry,
                int verbose);
 
-void sacrifice_phase(int num_online, Player &P, int fake_sacrifice,
-                     offline_control_data &OCD, const FHE_PK &pk,
-                     const FHE_SK &sk, const FFT_Data &PTD,
-                     FHE_Industry &industry,
-                     int verbose);
+/* As well as doing the input generation, this also acts as a bit
+ * of a control to ensure things close down cleanly
+ */
+void inputs_phase(int num_online, Player &P, int fake_sacrifice,
+                  offline_control_data &OCD, const FHE_PK &pk,
+                  const FHE_SK &sk, const FFT_Data &PTD,
+                  FHE_Industry &industry,
+                  int verbose);
 
 #endif
diff --git a/src/Offline/sacrifice.cpp b/src/Offline/sacrifice.cpp
index d3b8226b..436a64c9 100644
--- a/src/Offline/sacrifice.cpp
+++ b/src/Offline/sacrifice.cpp
@@ -236,7 +236,7 @@ void sacrifice_bits(Player &P, list<Share> &bits, list<Share> &a,
   vector<Share> bits1(amortize);
   Share temp;
 
-  int left_todo= bits.size();
+  int left_todo= min(bits.size(), a.size() / rep);
   while (left_todo > 0)
     {
       int this_loop= amortize;
diff --git a/src/Online/Online.cpp b/src/Online/Online.cpp
index 927b1143..fd52c902 100644
--- a/src/Online/Online.cpp
+++ b/src/Online/Online.cpp
@@ -9,6 +9,9 @@ All rights reserved
 
 #include "Online.h"
 #include "Processor/Processor.h"
+#include "OT/OT_Thread_Data.h"
+
+extern OT_Thread_Data OTD;
 
 void online_phase(int online_num, Player &P, offline_control_data &OCD,
                   Machine &machine)
@@ -21,13 +24,35 @@ void online_phase(int online_num, Player &P, offline_control_data &OCD,
   bool wait= true;
   while (wait)
     {
-      OCD.sacrifice_mutex[online_num].lock();
-      if (OCD.totm[online_num] > OCD.minm && OCD.tots[online_num] > OCD.mins &&
-          OCD.totb[online_num] > OCD.minb)
+      wait= false;
+      OCD.mul_mutex[online_num].lock();
+      if (OCD.totm[online_num] < OCD.minm)
+        {
+          wait= true;
+        }
+      OCD.mul_mutex[online_num].unlock();
+
+      OCD.sqr_mutex[online_num].lock();
+      if (OCD.tots[online_num] < OCD.mins)
+        {
+          wait= true;
+        }
+      OCD.sqr_mutex[online_num].unlock();
+
+      OCD.bit_mutex[online_num].lock();
+      if (OCD.totb[online_num] < OCD.minb)
         {
-          wait= false;
+          wait= true;
         }
-      OCD.sacrifice_mutex[online_num].unlock();
+      OCD.bit_mutex[online_num].unlock();
+
+      OTD.aBD.aBD_mutex.lock();
+      if (OTD.ready==false)
+	{
+	  wait=true;
+        }
+      OTD.aBD.aBD_mutex.unlock();
+
       if (wait)
         {
           sleep(1);
@@ -36,7 +61,7 @@ void online_phase(int online_num, Player &P, offline_control_data &OCD,
   printf("Starting online phase\n");
 
   // Initialise the program
-  Processor Proc(online_num, P.nplayers(), P, OCD);
+  Processor Proc(online_num, P.nplayers(), P);
 
   bool flag= true;
 
@@ -89,8 +114,9 @@ void online_phase(int online_num, Player &P, offline_control_data &OCD,
   machine.Lock_Until_Ready(online_num);
 
   // Signal offline threads I am dying now
-  OCD.sacrifice_mutex[online_num].lock();
+  OCD.OCD_mutex[online_num].lock();
   OCD.finish_offline[online_num]= 1;
-  OCD.sacrifice_mutex[online_num].unlock();
+  OCD.finished_online[online_num]= 1;
+  OCD.OCD_mutex[online_num].unlock();
   printf("Exiting online phase : %d\n", online_num);
 }
diff --git a/src/Processor/Instruction.cpp b/src/Processor/Instruction.cpp
index 2ae46817..3d6ae095 100644
--- a/src/Processor/Instruction.cpp
+++ b/src/Processor/Instruction.cpp
@@ -137,6 +137,7 @@ void BaseInstruction::parse_operands(istream &s, int pos)
       case STMINTI:
       case LEGENDREC:
       case SQUARE:
+      case DABIT:
       case CONVINT:
       case LTZINT:
       case EQZINT:
@@ -160,16 +161,15 @@ void BaseInstruction::parse_operands(istream &s, int pos)
         break;
       // instructions with 1 register operand
       case BIT:
-      case PRINTREGPLAIN:
+      case PRINT_REG:
       case LDTN:
       case LDARG:
       case STARG:
-      case JMPI:
       case PUSHINT:
       case POPINT:
-      case PRINTCHRINT:
-      case PRINTSTRINT:
-      case PRINTINT:
+      case PRINT_CHAR_REGINT:
+      case PRINT_CHAR4_REGINT:
+      case PRINT_INT:
         r[0]= get_int(s);
         break;
       // instructions with 2 registers + 1 integer operand
@@ -215,7 +215,6 @@ void BaseInstruction::parse_operands(istream &s, int pos)
       case STMINT:
       case JMPNZ:
       case JMPEQZ:
-      case PRINTREG:
       case LDINT:
       case INPUT_CLEAR:
       case OUTPUT_CLEAR:
@@ -236,19 +235,20 @@ void BaseInstruction::parse_operands(istream &s, int pos)
         m= get_int(s);
         break;
       // instructions with 1 reg + 2 integer operand
-      case PRINTFIXPLAIN:
+      case PRINT_FIX:
         r[0]= get_int(s);
         n= get_int(s);
         m= get_int(s);
         break;
       // instructions with 1 integer operand
-      case PRINTSTR:
-      case PRINTCHR:
+      case PRINT_CHAR4:
+      case PRINT_CHAR:
       case JMP:
-      case START_TIMER:
-      case STOP_TIMER:
+      case CALL:
+      case START_CLOCK:
+      case STOP_CLOCK:
       case CLOSE_CHAN:
-      case PRINTMEM:
+      case PRINT_MEM:
       case JOIN_TAPE:
         n= get_int(s);
         break;
@@ -257,9 +257,10 @@ void BaseInstruction::parse_operands(istream &s, int pos)
       case CLEAR_REGISTERS:
       case RESTART:
       case CRASH:
+      case RETURN:
         break;
       // instructions with 4 register operands
-      case PRINTFLOATPLAIN:
+      case PRINT_FLOAT:
       case MUL2SINT:
         get_vector(4, start, s);
         break;
@@ -298,10 +299,10 @@ void BaseInstruction::parse_operands(istream &s, int pos)
     }
 }
 
-int BaseInstruction::get_reg_type() const
+RegType BaseInstruction::get_reg_type() const
 {
   switch (opcode)
-    { // List here commands which write to a regint/sregint/sbit register or regint memory
+    { // List here commands which write to a specific type of register or a direct memory access
       case LDMINT:
       case LDMINTI:
       case MOVINT:
@@ -309,7 +310,6 @@ int BaseInstruction::get_reg_type() const
       case LDTN:
       case LDARG:
       case INPUT_INT:
-      case OPEN_CHAN:
       case RAND:
       case LDINT:
       case CONVMODP:
@@ -323,6 +323,8 @@ int BaseInstruction::get_reg_type() const
       case EQINT:
       case EQZINT:
       case STMINT:
+      case STMSINT:
+      case STMSINTI:
       case LDMSINT:
       case LDMSINTI:
       case MOVSINT:
@@ -339,17 +341,9 @@ int BaseInstruction::get_reg_type() const
       case SHRSINT:
       case NEG:
       case SAND:
-      case XORSB:
-      case ANDSB:
-      case ORSB:
-      case NEGB:
-      case LTZSINT:
-      case EQZSINT:
-      case BITSINT:
       case SINTBIT:
       case CONVSINTSREG:
       case CONVREGSREG:
-      case CONVSREGSINT:
       case OPENSINT:
       case OPENSBIT:
       case ANDSINT:
@@ -364,22 +358,88 @@ int BaseInstruction::get_reg_type() const
       case LF_REGINT:
       case LF_SREGINT:
         return INT;
+      case XORSB:
+      case ANDSB:
+      case ORSB:
+      case NEGB:
+      case LTZSINT:
+      case EQZSINT:
+      case BITSINT:
+        return SBIT;
+      case PUSHINT:
+      case STARG:
+      case REQBL:
+      case RUN_TAPE:
+      case JOIN_TAPE:
+      case CRASH:
+      case CLEAR_MEMORY:
+      case CLEAR_REGISTERS:
+      case PRINT_MEM:
+      case PRINT_REG:
+      case PRINT_CHAR:
+      case PRINT_CHAR4:
+      case PRINT_CHAR_REGINT:
+      case PRINT_CHAR4_REGINT:
+      case PRINT_FLOAT:
+      case PRINT_FIX:
+      case PRINT_INT:
+      case OPEN_CHAN:
+      case CLOSE_CHAN:
+      case OUTPUT_SHARE:
+      case OUTPUT_INT:
+      case PRIVATE_OUTPUT:
+      case JMP:
+      case JMPNZ:
+      case JMPEQZ:
+      case STARTOPEN:
+      case START_CLOCK:
+      case STOP_CLOCK:
+      case CALL:
+      case RETURN:
+        return NONE;
+      case DABIT:
+        return DUAL;
       default:
         return MODP;
     }
 }
 
-int BaseInstruction::get_max_reg(int reg_type) const
+/* This does an overestimate as it counts ALL values, even 
+ * if they are reading. But the reg_type looks only at
+ * the RETURN value type. So we will overcount some register
+ * usage. If we got the exact register usage it would cost
+ * more logic, and would save little in terms of memory
+ *
+ * The trick is that if a register is read it must be
+ * written so if we only count the instructions which
+ * write, and then take the max register in that
+ * instruction it will be OK
+ *   
+ * So if we had
+ *      blah with c0,c1,s0,s1 registers
+ *      c5 <- add c0, c1
+ *      s3 <- add c5, s1
+ * Then the max registers *should* be
+ *    c : 5    s: 3
+ * But we actually count
+ *    c : 5    s: 5
+ * due to the c5 read in the last instruction. But we only
+ * need a max and 3<5 so all is OK
+ *
+ * Dual is a weird one to catch the different write types
+ * of the DABIT instruction
+ *
+ */
+int BaseInstruction::get_max_reg(RegType reg_type) const
 {
-  if (get_reg_type() != reg_type)
+  if ((get_reg_type() == reg_type) || (get_reg_type() == DUAL && (reg_type == SBIT || reg_type == MODP)))
     {
-      return 0;
+      if (start.size())
+        return *max_element(start.begin(), start.end()) + size;
+      else
+        return *max_element(r, r + 3) + size;
     }
-
-  if (start.size())
-    return *max_element(start.begin(), start.end()) + size;
-  else
-    return *max_element(r, r + 3) + size;
+  return 0;
 }
 
 int Instruction::get_mem(RegType reg_type, SecrecyType sec_type) const
@@ -500,6 +560,12 @@ ostream &operator<<(ostream &s, const Instruction &instr)
       case STARG:
         s << "STARG";
         break;
+      case CALL:
+        s << "CALL";
+        break;
+      case RETURN:
+        s << "RETURN";
+        break;
       case RUN_TAPE:
         s << "RUN_TAPE";
         break;
@@ -632,6 +698,9 @@ ostream &operator<<(ostream &s, const Instruction &instr)
       case SQUARE:
         s << "SQUARE";
         break;
+      case DABIT:
+        s << "DABIT";
+        break;
       case ANDC:
         s << "ANDC";
         break;
@@ -689,9 +758,6 @@ ostream &operator<<(ostream &s, const Instruction &instr)
       case EQINT:
         s << "EQINT";
         break;
-      case JMPI:
-        s << "JMPI";
-        break;
       case LDINT:
         s << "LDINT";
         break;
@@ -713,44 +779,41 @@ ostream &operator<<(ostream &s, const Instruction &instr)
       case CONVMODP:
         s << "CONVMODP";
         break;
-      case PRINTMEM:
-        s << "PRINTMEM";
+      case PRINT_MEM:
+        s << "PRINT_MEM";
         break;
-      case PRINTREG:
-        s << "PRINTREG";
+      case PRINT_REG:
+        s << "PRINT_REG";
         break;
-      case PRINTREGPLAIN:
-        s << "PRINTREGPLAIN";
+      case PRINT_CHAR:
+        s << "PRINT_CHAR";
         break;
-      case PRINTCHR:
-        s << "PRINTCHR";
+      case PRINT_CHAR4:
+        s << "PRINT_CHAR4";
         break;
-      case PRINTSTR:
-        s << "PRINTSTR";
+      case PRINT_CHAR_REGINT:
+        s << "PRINT_CHAR_REGINT";
         break;
-      case PRINTCHRINT:
-        s << "PRINTCHRINT";
+      case PRINT_CHAR4_REGINT:
+        s << "PRINT_CHAR4_REGINT";
         break;
-      case PRINTSTRINT:
-        s << "PRINTSTRINT";
+      case PRINT_FLOAT:
+        s << "PRINT_FLOAT";
         break;
-      case PRINTFLOATPLAIN:
-        s << "PRINTFLOATPLAIN";
+      case PRINT_FIX:
+        s << "PRINT_FIX";
         break;
-      case PRINTFIXPLAIN:
-        s << "PRINTFIXPLAIN";
-        break;
-      case PRINTINT:
-        s << "PRINTINT";
+      case PRINT_INT:
+        s << "PRINT_INT";
         break;
       case RAND:
         s << "RAND";
         break;
-      case START_TIMER:
-        s << "START_TIMER";
+      case START_CLOCK:
+        s << "START_CLOCK";
         break;
-      case STOP_TIMER:
-        s << "STOP_TIMER";
+      case STOP_CLOCK:
+        s << "STOP_CLOCK";
         break;
       case LDMSINT:
         s << "LDMSINT";
@@ -1035,9 +1098,13 @@ ostream &operator<<(ostream &s, const Instruction &instr)
         break;
       // instructions with 1 sint + 1 sregint register operand
       case CONVSREGSINT:
-        s << "s_" << instr.r[0] << " ";
+	s << "s_" << instr.r[0] << " ";
         s << "sr_" << instr.r[1] << " ";
         break;
+      case DABIT:
+        s << "s_" << instr.r[0] << " ";
+        s << "sb_" << instr.r[1] << " ";
+        break;
       // instructions with 1 rint + 1 sregint register operand
       case OPENSINT:
         s << "r_" << instr.r[0] << " ";
@@ -1085,7 +1152,7 @@ ostream &operator<<(ostream &s, const Instruction &instr)
         s << "sb_" << instr.r[1] << " ";
         break;
       // instructions with 1 cint register operands
-      case PRINTREGPLAIN:
+      case PRINT_REG:
         s << "c_" << instr.r[0] << " ";
         break;
       // instructions with 1 sint register operands
@@ -1093,10 +1160,9 @@ ostream &operator<<(ostream &s, const Instruction &instr)
         s << "s_" << instr.r[0] << " ";
         break;
       // instructions with 1 rint register operands
-      case PRINTINT:
-      case PRINTCHRINT:
-      case PRINTSTRINT:
-      case JMPI:
+      case PRINT_INT:
+      case PRINT_CHAR_REGINT:
+      case PRINT_CHAR4_REGINT:
       case PUSHINT:
       case POPINT:
       case STARG:
@@ -1166,7 +1232,6 @@ ostream &operator<<(ostream &s, const Instruction &instr)
       case LDI:
       case LDMC:
       case STMC:
-      case PRINTREG:
       case INPUT_CLEAR:
       case OUTPUT_CLEAR:
         s << "c_" << instr.r[0] << " ";
@@ -1187,21 +1252,22 @@ ostream &operator<<(ostream &s, const Instruction &instr)
         s << instr.m << " ";
         break;
       // instructions with 1 sint + 2 integer operands
-      case PRINTFIXPLAIN:
+      case PRINT_FIX:
         s << "c_" << instr.r[0] << " ";
         s << instr.n << " ";
         s << instr.m << " ";
         break;
       // instructions with 1 integer operand
-      case PRINTSTR:
-      case PRINTCHR:
+      case PRINT_CHAR:
+      case PRINT_CHAR4:
       case REQBL:
       case JMP:
-      case START_TIMER:
-      case STOP_TIMER:
+      case START_CLOCK:
+      case STOP_CLOCK:
       case CLOSE_CHAN:
-      case PRINTMEM:
+      case PRINT_MEM:
       case JOIN_TAPE:
+      case CALL:
         s << instr.n << " ";
         break;
       // instructions with no operand
@@ -1209,6 +1275,7 @@ ostream &operator<<(ostream &s, const Instruction &instr)
       case CRASH:
       case CLEAR_MEMORY:
       case CLEAR_REGISTERS:
+      case RETURN:
         break;
       // Three integer operands
       case RUN_TAPE:
@@ -1223,7 +1290,7 @@ ostream &operator<<(ostream &s, const Instruction &instr)
             s << "sr_" << instr.start[i] << " ";
           }
         break;
-      case PRINTFLOATPLAIN:
+      case PRINT_FLOAT:
       case STOPOPEN:
         for (unsigned int i= 0; i < instr.start.size(); i++)
           {
@@ -1332,7 +1399,6 @@ void Instruction::execute_using_sacrifice_data(
   // Check to see if we have to wait
   Wait_For_Preproc(opcode, size, thread, OCD);
   // Now do the work
-  OCD.sacrifice_mutex[thread].lock();
   Proc.increment_PC();
 
   int r[3]= {this->r[0], this->r[1], this->r[2]};
@@ -1342,22 +1408,28 @@ void Instruction::execute_using_sacrifice_data(
       switch (opcode)
         {
           case TRIPLE:
+            OCD.mul_mutex[thread].lock();
             Proc.get_Sp_ref(r[0])= SacrificeD[thread].TD.ta.front();
             SacrificeD[thread].TD.ta.pop_front();
             Proc.get_Sp_ref(r[1])= SacrificeD[thread].TD.tb.front();
             SacrificeD[thread].TD.tb.pop_front();
             Proc.get_Sp_ref(r[2])= SacrificeD[thread].TD.tc.front();
             SacrificeD[thread].TD.tc.pop_front();
+            OCD.mul_mutex[thread].unlock();
             break;
           case SQUARE:
+            OCD.sqr_mutex[thread].lock();
             Proc.get_Sp_ref(r[0])= SacrificeD[thread].SD.sa.front();
             SacrificeD[thread].SD.sa.pop_front();
             Proc.get_Sp_ref(r[1])= SacrificeD[thread].SD.sb.front();
             SacrificeD[thread].SD.sb.pop_front();
+            OCD.sqr_mutex[thread].unlock();
             break;
           case BIT:
+            OCD.bit_mutex[thread].lock();
             Proc.get_Sp_ref(r[0])= SacrificeD[thread].BD.bb.front();
             SacrificeD[thread].BD.bb.pop_front();
+            OCD.bit_mutex[thread].unlock();
             break;
           default:
             throw bad_value();
@@ -1370,7 +1442,6 @@ void Instruction::execute_using_sacrifice_data(
           r[2]++;
         }
     }
-  OCD.sacrifice_mutex[thread].unlock();
 }
 
 bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
@@ -1397,6 +1468,16 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
   int r[3]= {this->r[0], this->r[1], this->r[2]};
   int n= this->n;
 
+  // Extract daBit
+  if (opcode == DABIT)
+    {
+      for (unsigned int i= 0; i < size; i++)
+        {
+          Proc.write_daBit(r[0] + i, r[1] + i);
+        }
+      return restart;
+    }
+
   for (unsigned int i= 0; i < size; i++)
     {
       switch (opcode)
@@ -1784,9 +1865,6 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
           case JMP:
             Proc.relative_jump((signed int) n);
             break;
-          case JMPI:
-            Proc.relative_jump((signed int) Proc.read_Ri(r[0]));
-            break;
           case JMPNZ:
             if (Proc.read_Ri(r[0]) != 0)
               {
@@ -1799,6 +1877,15 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 Proc.relative_jump((signed int) n);
               }
             break;
+          case CALL:
+            Proc.pushi(Proc.get_PC());
+            Proc.relative_jump((signed int) n);
+            break;
+          case RETURN:
+            long ret_pos;
+            Proc.popi(ret_pos);
+            Proc.jump(ret_pos);
+            break;
           case EQZINT:
             if (Proc.read_Ri(r[1]) == 0)
               Proc.write_Ri(r[0], 1);
@@ -1851,7 +1938,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
             to_signed_bigint(Proc.temp.aa, Proc.read_Cp(r[1]), n);
             Proc.write_Ri(r[0], Proc.temp.aa.get_si());
             break;
-          case PRINTMEM:
+          case PRINT_MEM:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1859,16 +1946,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTREG:
-            if (P.whoami() == 0)
-              {
-                stringstream ss;
-                ss << "Reg[" << r[0] << "] = " << Proc.read_Cp(r[0]) << " # "
-                   << string((char *) &n, sizeof(n)) << endl;
-                machine.get_IO().debug_output(ss);
-              }
-            break;
-          case PRINTREGPLAIN:
+          case PRINT_REG:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1876,7 +1954,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTFIXPLAIN:
+          case PRINT_FIX:
             if (P.whoami() == 0)
               {
                 gfp v= Proc.read_Cp(r[0]);
@@ -1893,7 +1971,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTFLOATPLAIN:
+          case PRINT_FLOAT:
             if (P.whoami() == 0)
               {
                 gfp v= Proc.read_Cp(start[0]);
@@ -1920,7 +1998,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTINT:
+          case PRINT_INT:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1928,7 +2006,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTSTR:
+          case PRINT_CHAR4:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1936,7 +2014,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTCHR:
+          case PRINT_CHAR:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1944,7 +2022,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTCHRINT:
+          case PRINT_CHAR_REGINT:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1952,7 +2030,7 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
                 machine.get_IO().debug_output(ss);
               }
             break;
-          case PRINTSTRINT:
+          case PRINT_CHAR4_REGINT:
             if (P.whoami() == 0)
               {
                 stringstream ss;
@@ -1963,10 +2041,10 @@ bool Instruction::execute(Processor &Proc, Player &P, Machine &machine,
           case RAND:
             Proc.write_Ri(r[0], Proc.get_random_uint() % (1 << Proc.read_Ri(r[1])));
             break;
-          case START_TIMER:
+          case START_CLOCK:
             machine.start_timer(n);
             break;
-          case STOP_TIMER:
+          case STOP_CLOCK:
             machine.stop_timer(n);
             break;
           case REQBL:
diff --git a/src/Processor/Instruction.h b/src/Processor/Instruction.h
index 2ec7bd94..6dc9cefa 100644
--- a/src/Processor/Instruction.h
+++ b/src/Processor/Instruction.h
@@ -57,6 +57,8 @@ enum {
   LDARG= 0x11,
   REQBL= 0x12,
   STARG= 0x13,
+  CALL= 0x14,
+  RETURN= 0x15,
   RUN_TAPE= 0x19,
   JOIN_TAPE= 0x1A,
   CRASH= 0x1B,
@@ -113,6 +115,7 @@ enum {
   TRIPLE= 0x50,
   BIT= 0x51,
   SQUARE= 0x52,
+  DABIT= 0x53,
 
   // sregint/sbit instructions
   LDMSINT= 0x60,
@@ -162,7 +165,6 @@ enum {
   LTINT= 0x95,
   GTINT= 0x96,
   EQINT= 0x97,
-  JMPI= 0x98,
 
   // Integers
   LDINT= 0x9A,
@@ -179,16 +181,15 @@ enum {
   CONVSREGSINT= 0xC4,
 
   // Debug Printing
-  PRINTMEM= 0xB0,
-  PRINTREG= 0XB1,
-  PRINTREGPLAIN= 0xB2,
-  PRINTCHR= 0xB3,
-  PRINTSTR= 0xB4,
-  PRINTCHRINT= 0xB5,
-  PRINTSTRINT= 0xB6,
-  PRINTFLOATPLAIN= 0xB7,
-  PRINTFIXPLAIN= 0xB8,
-  PRINTINT= 0xB9,
+  PRINT_MEM= 0xB0,
+  PRINT_REG= 0xB2,
+  PRINT_CHAR= 0xB3,
+  PRINT_CHAR4= 0xB4,
+  PRINT_CHAR_REGINT= 0xB5,
+  PRINT_CHAR4_REGINT= 0xB6,
+  PRINT_FLOAT= 0xB7,
+  PRINT_FIX= 0xB8,
+  PRINT_INT= 0xB9,
 
   // Comparison of sregints
   EQZSINT= 0xD0,
@@ -213,8 +214,8 @@ enum {
 
   // Others
   RAND= 0xE0,
-  START_TIMER= 0xE1,
-  STOP_TIMER= 0xE2,
+  START_CLOCK= 0xE1,
+  STOP_CLOCK= 0xE2,
 
   // Local functions
   LF_CINT= 0xEA,
@@ -228,7 +229,10 @@ enum {
 enum RegType {
   MODP,
   INT,
+  SBIT,
+  DUALBIT,
   MAX_REG_TYPE,
+  DUAL,
   NONE
 };
 
@@ -254,12 +258,12 @@ class BaseInstruction
 
   void parse_operands(istream &s, int pos);
 
-  virtual int get_reg_type() const;
+  virtual RegType get_reg_type() const;
 
   bool is_direct_memory_access(SecrecyType sec_type) const;
 
   // Returns the maximal register used
-  int get_max_reg(int reg_type) const;
+  int get_max_reg(RegType reg_type) const;
 };
 
 class Instruction : public BaseInstruction
diff --git a/src/Processor/Processor.cpp b/src/Processor/Processor.cpp
index 0123dde2..3ce5101b 100644
--- a/src/Processor/Processor.cpp
+++ b/src/Processor/Processor.cpp
@@ -15,11 +15,10 @@ extern Base_Circuits Global_Circuit_Store;
 extern Local_Functions Global_LF;
 
 Processor::Processor(int thread_num, unsigned int nplayers,
-                     Player &P, offline_control_data &OCD)
-    : online_thread_num(thread_num),
-      daBitGen(daBitMachine, P, thread_num, OCD),
-      iop(nplayers)
+                     Player &P) : online_thread_num(thread_num), iop(nplayers)
 {
+
+  daBitGen = daBitMachine.new_generator(P, thread_num);
   rounds= 0;
   sent= 0;
 }
@@ -27,6 +26,12 @@ Processor::Processor(int thread_num, unsigned int nplayers,
 Processor::~Processor()
 {
   fprintf(stderr, "Sent %d elements in %d rounds\n", sent, rounds);
+#ifdef VERBOSE 
+  cout << "dabitgen statistics:" << endl;
+  cout << "Produced " << daBitGen->total << " dabits" << endl;
+  for (auto &timer: daBitGen->timers)
+      cout << timer.first << " took time " << timer.second.elapsed() / 1e6 << endl;
+#endif
 }
 
 void Processor::POpen_Start(const vector<int> &reg, int size, Player &P)
@@ -129,11 +134,13 @@ void Processor::execute(const Program &prog, int argument, Player &P,
 {
   reg_maxp= prog.num_reg(MODP);
   reg_maxi= prog.num_reg(INT);
+  reg_maxb= prog.num_reg(SBIT);
+
   Cp.resize(reg_maxp);
   Sp.resize(reg_maxp);
   Ri.resize(reg_maxi);
   srint.resize(reg_maxi);
-  sbit.resize(reg_maxi);
+  sbit.resize(reg_maxb);
 
   for (int i= 0; i < reg_maxp; i++)
     {
@@ -148,11 +155,14 @@ void Processor::execute(const Program &prog, int argument, Player &P,
     }
   rwi.resize(reg_maxi);
   rwsr.resize(reg_maxi);
-  rwsb.resize(reg_maxi);
   for (int i= 0; i < reg_maxi; i++)
     {
       rwi[i]= 0;
       rwsr[i]= 0;
+    }
+  rwsb.resize(reg_maxb);
+  for (int i= 0; i < reg_maxb; i++)
+    {
       rwsb[i]= 0;
     }
 #endif
@@ -202,6 +212,7 @@ void Processor::convert_sint_to_sregint_small(int i0, int i1, Player &P)
   vector<Share> bpr(size);
   vector<aBit> b2r(size);
   // Get a set of size daBits, until the value is less than p
+  auto& daBitGen = get_generator();
   while (!done)
     { // Get daBits
       daBitV.get_daBits(bpr, b2r, daBitGen);
@@ -291,6 +302,7 @@ void Processor::convert_sint_to_sregint(int i0, int i1, Player &P)
 
   // Get a daBit
   vector<Share> bpr(size1);
+  auto& daBitGen = get_generator();
   daBitV.get_daBits(bpr, input[1], daBitGen);
 
   // Now form r
@@ -341,6 +353,7 @@ void Processor::convert_sregint_to_sint(int i0, int i1, Player &P)
   /* Get sreg_bitl daBits */
   vector<aBit> a2r(sreg_bitl);
   vector<Share> apr(sreg_bitl);
+  auto& daBitGen = get_generator();
   daBitV.get_daBits(apr, a2r, daBitGen);
 
   /* Add onto the input register */
diff --git a/src/Processor/Processor.h b/src/Processor/Processor.h
index 99b74aa5..3a328e73 100644
--- a/src/Processor/Processor.h
+++ b/src/Processor/Processor.h
@@ -47,8 +47,6 @@ class Processor
   vector<aBitVector> srint;
   vector<aBit> sbit;
 
-  int reg_maxp, reg_maxi;
-
 // In DEBUG mode we keep track of valid/invalid read/writes on the registers
 #ifdef DEBUG
   vector<int> rwp;
@@ -60,6 +58,9 @@ class Processor
   // Program counter
   unsigned int PC;
 
+  // These are here for DEBUG mode
+  int reg_maxp, reg_maxi, reg_maxb;
+
   // This is the vector of partially opened values and shares we need to store
   // as the Open commands are split in two
   vector<gfp> PO;
@@ -77,7 +78,7 @@ class Processor
    * within instructions
    */
   // In the case when the OT thread is active this holds the daBitGenerator for this thread
-  DABitGenerator daBitGen;
+  AbstractDABitGenerator* daBitGen;
   // This holds the computed daBits
   daBitVector daBitV;
 
@@ -88,11 +89,16 @@ class Processor
   // Data structures for input and output of private data
   Processor_IO iop;
 
+  // retrieve dabit generator to avoid dealing with pointers
+  AbstractDABitGenerator& get_generator()
+  {
+    return *daBitGen;
+  }
+
 public:
   friend class Instruction;
 
-  Processor(int online_thread_num, unsigned int nplayers, Player &P,
-            offline_control_data &OCD);
+  Processor(int online_thread_num, unsigned int nplayers, Player &P);
   ~Processor();
 
   void clear_registers();
@@ -238,6 +244,15 @@ class Processor
     sbit.at(i)= x;
   }
 
+  void write_daBit(int i1, int j1) // DEBUG variant: fetch one daBit into Sp[i1] and sbit[j1]
+  {
+    daBitV.get_daBit(temp.Sansp, temp.aB, *daBitGen); // daBitGen is a pointer; dereference it as the non-DEBUG variant does
+    rwp[i1 + reg_maxp]= 1; // record the writes in the DEBUG read/write trackers
+    rwsb[j1]= 1;
+    Sp.at(i1)= temp.Sansp; // .at() keeps the bounds check in DEBUG mode
+    sbit.at(j1)= temp.aB;
+  }
+
 #else
   const gfp &read_Cp(int i) const
   {
@@ -302,6 +317,12 @@ class Processor
   {
     sbit[i]= x;
   }
+  void write_daBit(int i1, int j1) // non-DEBUG variant: fetch one daBit and store its two halves
+  {
+    daBitV.get_daBit(temp.Sansp, temp.aB, get_generator()); // get_generator() yields *daBitGen
+    write_Sp(i1, temp.Sansp); // first half goes to Sp register i1
+    write_sbit(j1, temp.aB);  // second half goes to sbit register j1
+  }
 
 #endif
 
diff --git a/src/Processor/Processor_IO.cpp b/src/Processor/Processor_IO.cpp
index fb74ef07..430a276e 100644
--- a/src/Processor/Processor_IO.cpp
+++ b/src/Processor/Processor_IO.cpp
@@ -34,7 +34,7 @@ void Processor_IO::private_input(unsigned int player, int target, unsigned int c
     }
 
   stringstream ss;
-  OCD.sacrifice_mutex[thread].lock();
+  OCD.OCD_mutex[thread].lock();
   rshares[player]= SacrificeD[thread].ID.ios[player].front();
   SacrificeD[thread].ID.ios[player].pop_front();
   if (player == P.whoami())
@@ -43,7 +43,7 @@ void Processor_IO::private_input(unsigned int player, int target, unsigned int c
       SacrificeD[thread].ID.opened_ios.pop_front();
       i_epsilon.output(ss, false);
     }
-  OCD.sacrifice_mutex[thread].unlock();
+  OCD.OCD_mutex[thread].unlock();
   Proc.increment_counters(Share::SD.M.shares_per_player(P.whoami()));
 
   if (player == P.whoami())
@@ -76,7 +76,7 @@ void Processor_IO::private_output(unsigned int player, int source, unsigned int
   int thread= Proc.get_thread_num();
   Wait_For_Preproc(DATA_INPUT_MASK, 1, thread, OCD, player);
 
-  OCD.sacrifice_mutex[thread].lock();
+  OCD.OCD_mutex[thread].lock();
 
   gfp o_epsilon;
 
@@ -90,7 +90,7 @@ void Processor_IO::private_output(unsigned int player, int source, unsigned int
   vector<gfp> values(1);
   shares[0]= SacrificeD[thread].ID.ios[player].front();
   SacrificeD[thread].ID.ios[player].pop_front();
-  OCD.sacrifice_mutex[thread].unlock();
+  OCD.OCD_mutex[thread].unlock();
 
   shares[0].add(Proc.get_Sp_ref(source));
 
diff --git a/src/Processor/Program.cpp b/src/Processor/Program.cpp
index 2fd89394..89ab9bbe 100644
--- a/src/Processor/Program.cpp
+++ b/src/Processor/Program.cpp
@@ -19,7 +19,7 @@ void Program::compute_constants()
     {
       for (int reg_type= 0; reg_type < MAX_REG_TYPE; reg_type++)
         {
-          max_reg[reg_type]= max(max_reg[reg_type], p[i].get_max_reg(reg_type));
+          max_reg[reg_type]= max(max_reg[reg_type], p[i].get_max_reg(RegType(reg_type)));
           for (int sec_type= 0; sec_type < MAX_SECRECY_TYPE; sec_type++)
             {
               max_mem[reg_type][sec_type]=
@@ -28,6 +28,16 @@ void Program::compute_constants()
             }
         }
     }
+
+  /*
+  cout << "Counts....\ntype\tmaxreg\tmemc\tmems\n";
+  for (int reg_type= 0; reg_type < MAX_REG_TYPE; reg_type++)
+    { cout <<  reg_type << "\t" << max_reg[reg_type] << "\t";
+      for (int sec_type= 0; sec_type < MAX_SECRECY_TYPE; sec_type++)
+        { cout << max_mem[reg_type][sec_type] << "\t"; }
+      cout << endl;
+    }
+  */
 }
 
 void Program::parse(stringstream &s)
diff --git a/src/Setup.cpp b/src/Setup.cpp
index b2cef178..0e3f1047 100644
--- a/src/Setup.cpp
+++ b/src/Setup.cpp
@@ -47,9 +47,9 @@ void init_FHE_Params(FHE_Params &params, bigint &pr, bigint &p0, bigint &p1,
 {
 // Assumes here pr=0 if we want it to be so
 #ifdef TOP_GEAR
-  Generate_Parameters(N, p0, p1, pr, lg2p, hwt, n, TopGear);
+  Generate_Parameters(N, p0, p1, pr, lg2p, n, TopGear, hwt);
 #else
-  Generate_Parameters(N, p0, p1, pr, lg2p, hwt, n, HighGear);
+  Generate_Parameters(N, p0, p1, pr, lg2p, n, HighGear, hwt);
 #endif
 
   Ring Rg(2 * N);
@@ -76,7 +76,7 @@ void init_FHE(bigint &pr, int lg2p, unsigned int n)
 {
   bigint p0, p1;
   FHE_Params params;
-  unsigned int N, hwt= 64;
+  unsigned int N, hwt= HwtSK;
   init_FHE_Params(params, pr, p0, p1, N, lg2p, n, hwt);
 
   FHE_PK pk(params, pr);
@@ -191,10 +191,15 @@ void init_certs()
     }
   EVP_PKEY_free(pkey);
 
+  /* XXXX
   cout << "Fake offline?" << endl;
   output << input_YN() << endl;
   cout << "Fake sacrifice?" << endl;
   output << input_YN() << endl;
+*/
+  // Choose non-fake in both cases
+  output << 0 << endl;
+  output << 0 << endl;
 
   output.close();
 }
@@ -286,7 +291,7 @@ void init_replicated(ShareData &SD, unsigned int n)
       AS.assign(n, t);
     }
 
-  cout << "What type of Offline Phase do you want (assuming non-fake)?\n";
+  cout << "What type of Offline Phase do you want ?\n";
   cout << "\t 1) Maurer\n";
   cout << "\t 2) Reduced";
   cout << endl;
diff --git a/src/System/Player.cpp b/src/System/Player.cpp
index 56b5a379..05666658 100644
--- a/src/System/Player.cpp
+++ b/src/System/Player.cpp
@@ -151,6 +151,8 @@ Player::Player(int mynumber, const SystemData &SD, int thread, SSL_CTX *ctx,
   ssl.resize(SD.n);
 
 #ifdef BENCH_NETDATA
+  br_messages_sent= 0;
+  pp_messages_sent= 0;
   data_sent= 0;
   data_received= 0;
 #endif
@@ -251,6 +253,7 @@ void Player::send_all(const string &o, int connection, bool verbose) const
 {
   uint8_t buff[4];
 #ifdef BENCH_NETDATA
+  br_messages_sent++;
   int len_buff= 4;
 #endif
   encode_length(buff, o.length());
@@ -301,6 +304,7 @@ void Player::send_to_player(int player, const string &o, int connection) const
       throw sending_error();
     }
 #ifdef BENCH_NETDATA
+  pp_messages_sent++;
   data_sent+= len_buff + o.length();
 #endif
 }
@@ -458,3 +462,27 @@ void Player::Send_Distinct_And_Receive(vector<string> &o, int connection) const
         }
     }
 }
+
+#ifdef BENCH_NETDATA
+/* Print this player's network counters for thread thread_num as one
+ * JSON object wrapped in the BENCH_* marker macros: bytes sent and
+ * received (also divided by 10^6 and labelled MB) plus the broadcast
+ * and point-to-point message counters.
+ */
+void Player::print_network_data(int thread_num)
+{
+  printf(BENCH_TEXT_BOLD BENCH_COLOR_BLUE BENCH_MAGIC_START
+         "{\"player\":%u,\n"
+         "  \"thread\":%d,\n"
+         "  \"netdata\":{\n"
+         "    \"sent\":{\"bytes\":%ld,\"MB\":%.2f},\n"
+         "    \"received\":{\"bytes\":%ld,\"MB\":%.2f}\n"
+         "  },\n"
+         "  \"roundsdata\":{\n"
+         "    \"broadcast\":%ld,\n"
+         "    \"p-to-p\":%ld\n"
+         "  }\n"
+         "}\n" BENCH_MAGIC_END BENCH_ATTR_RESET,
+         me, thread_num, data_sent, ((double) data_sent / 1000000), data_received, ((double) data_received / 1000000), br_messages_sent, pp_messages_sent);
+}
+#endif
diff --git a/src/System/Player.h b/src/System/Player.h
index 2883ded1..001d4fa0 100644
--- a/src/System/Player.h
+++ b/src/System/Player.h
@@ -15,6 +15,7 @@ All rights reserved
 #include "SystemData.h"
 #include "Tools/Timer.h"
 #include "Tools/random.h"
+#include <map>
 
 void Init_SSL_CTX(SSL_CTX *&ctx, unsigned int me, const SystemData &SD);
 
@@ -28,6 +29,9 @@ class Player
   // network data in bytes
   mutable long data_sent;
   mutable long data_received;
+  // messages sent (broadcast and pp)
+  mutable long pp_messages_sent;
+  mutable long br_messages_sent;
 #endif
 
   // We have an array of ssl[nplayer][3] connections
@@ -100,18 +104,7 @@ class Player
   void Send_Distinct_And_Receive(vector<string> &o, int connection= 0) const;
 
 #ifdef BENCH_NETDATA
-  void print_network_data(int thread_num)
-  {
-    printf(BENCH_TEXT_BOLD BENCH_COLOR_BLUE BENCH_MAGIC_START
-           "{\"player\":%u,\n"
-           "  \"thread\":%d,\n"
-           "  \"netdata\":{\n"
-           "    \"sent\":{\"bytes\":%ld,\"MB\":%.2f},\n"
-           "    \"received\":{\"bytes\":%ld,\"MB\":%.2f}\n"
-           "  }\n"
-           "}\n" BENCH_MAGIC_END BENCH_ATTR_RESET,
-           me, thread_num, data_sent, ((double) data_sent / 1000000), data_received, ((double) data_received / 1000000));
-  }
+  void print_network_data(int thread_num);
 #endif
 };
 
diff --git a/src/System/RunTime.cpp b/src/System/RunTime.cpp
index a4f88801..1db32d90 100644
--- a/src/System/RunTime.cpp
+++ b/src/System/RunTime.cpp
@@ -71,9 +71,6 @@ Timer global_time;
 // Forward declarations to make code easier to read
 void *Main_Func(void *ptr);
 
-vector<triples_data> TriplesD;
-vector<squares_data> SquaresD;
-vector<bits_data> BitsD;
 vector<sacrificed_data> SacrificeD;
 
 MaliciousDABitMachine daBitMachine;
@@ -112,12 +109,9 @@ void Run_Scale(unsigned int my_number, unsigned int no_online_threads,
 
   Global_Circuit_Store.initialize(gfp::pr());
 
-  OCD.resize(no_online_threads);
+  OCD.resize(no_online_threads, SD.n, my_number);
   OTD.init(no_online_threads);
 
-  TriplesD.resize(no_online_threads);
-  SquaresD.resize(no_online_threads);
-  BitsD.resize(no_online_threads);
   SacrificeD.resize(no_online_threads);
   for (unsigned int i= 0; i < no_online_threads; i++)
     {
@@ -129,7 +123,7 @@ void Run_Scale(unsigned int my_number, unsigned int no_online_threads,
   unsigned int tnthreads= nthreads + number_FHE_threads;
   // Add in the OT threads
   tnthreads+= 2;
-  daBitMachine.Initialize(SD.n);
+  daBitMachine.Initialize(SD.n, OCD);
 
   /* Initialize the networking TCP sockets */
   int ssocket;
@@ -192,6 +186,20 @@ void Run_Scale(unsigned int my_number, unsigned int no_online_threads,
     }
 
   Close_Connections(ssocket, csockets, my_number);
+
+  global_time.stop();
+  cout << "Total Time (with thread locking) = " << global_time.elapsed() << " seconds" << endl;
+
+  long long total_triples = 0, total_squares = 0, total_bits = 0;
+  for (size_t i = 0; i < no_online_threads; i++) {
+    total_triples += OCD.totm[i];
+    total_squares += OCD.tots[i];
+    total_bits += OCD.totb[i];
+  }
+  cout << "Produced a total of " << total_triples << " triples" << endl;
+  cout << "Produced a total of " << total_squares << " squares" << endl;
+  cout << "Produced a total of " << total_bits << " bits" << endl;
+
 }
 
 #ifdef BENCH_MEMORY
@@ -235,6 +243,7 @@ void *Main_Func(void *ptr)
   fflush(stdout);
 
   Player P(me, *(tinfo->SD), num, (tinfo->ctx), (tinfo->csockets), (tinfo->MacK), verbose - 1);
+
   printf("Set up player %d in thread %d \n", me, num);
   fflush(stdout);
 
@@ -245,20 +254,20 @@ void *Main_Func(void *ptr)
       switch (num5)
         {
           case 0:
-            mult_phase(num_online, P, *(tinfo->OCD), *(tinfo->pk), *(tinfo->sk),
-                       *(tinfo->PTD), *(tinfo)->industry, verbose);
+            mult_phase(num_online, P, (tinfo->SD)->fake_sacrifice, *(tinfo->OCD),
+                       *(tinfo->pk), *(tinfo->sk), *(tinfo->PTD), *(tinfo)->industry, verbose);
             break;
           case 1:
-            square_phase(num_online, P, *(tinfo->OCD), *(tinfo->pk), *(tinfo->sk),
-                         *(tinfo->PTD), *(tinfo)->industry, verbose);
+            square_phase(num_online, P, (tinfo->SD)->fake_sacrifice, *(tinfo->OCD),
+                         *(tinfo->pk), *(tinfo->sk), *(tinfo->PTD), *(tinfo)->industry, verbose);
             break;
           case 2:
-            bit_phase(num_online, P, *(tinfo->OCD), *(tinfo->pk), *(tinfo->sk),
-                      *(tinfo->PTD), *(tinfo)->industry, verbose);
+            bit_phase(num_online, P, (tinfo->SD)->fake_sacrifice, *(tinfo->OCD),
+                      *(tinfo->pk), *(tinfo->sk), *(tinfo->PTD), *(tinfo)->industry, verbose);
             break;
           case 3:
-            sacrifice_phase(num_online, P, (tinfo->SD)->fake_sacrifice, *(tinfo->OCD),
-                            *(tinfo->pk), *(tinfo->sk), *(tinfo->PTD), *(tinfo)->industry, verbose);
+            inputs_phase(num_online, P, (tinfo->SD)->fake_sacrifice, *(tinfo->OCD),
+                         *(tinfo->pk), *(tinfo->sk), *(tinfo->PTD), *(tinfo)->industry, verbose);
             break;
           case 4:
             online_phase(num_online, P, *(tinfo->OCD), *(tinfo)->machine);
diff --git a/src/Tools/Crypto.cpp b/src/Tools/Crypto.cpp
index f0ab5a38..6f0ed508 100644
--- a/src/Tools/Crypto.cpp
+++ b/src/Tools/Crypto.cpp
@@ -25,8 +25,10 @@ string Hash(const string &data)
   stringstream ss;
   for (int i= 0; i < SHA256_DIGEST_LENGTH; i++)
     {
-      ss << hex << setw(2) << setfill('0') << (int) hash[i];
+      // Hex (human-readable) form was: ss << hex << setw(2) << setfill('0') << (int) hash[i];
+      ss << hash[i];
     }
+
   return ss.str();
 }
 
diff --git a/src/cmake-modules/FindCryptoPP.cmake b/src/cmake-modules/FindCryptoPP.cmake
new file mode 100644
index 00000000..c3277eb9
--- /dev/null
+++ b/src/cmake-modules/FindCryptoPP.cmake
@@ -0,0 +1,23 @@
+# Find module for the Crypto++ library.
+#
+# Input:   CRYPTOPP_ROOT_DIR  optional hint; its include/ and lib/ are searched
+# Output:  CRYPTOPP_INCLUDE_DIRS, CRYPTOPP_LIBRARIES, CryptoPP_FOUND
+
+find_path(CRYPTOPP_INCLUDE_DIR
+        NAMES cryptopp/cryptlib.h
+        HINTS "${CRYPTOPP_ROOT_DIR}/include"
+        )
+
+find_library(CRYPTOPP_LIBRARY
+        NAMES cryptopp
+        HINTS "${CRYPTOPP_ROOT_DIR}/lib"
+        )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CryptoPP REQUIRED_VARS CRYPTOPP_LIBRARY CRYPTOPP_INCLUDE_DIR)
+
+# Hide the cache entries created by find_path/find_library above.
+mark_as_advanced(CRYPTOPP_INCLUDE_DIR CRYPTOPP_LIBRARY)
+
+set(CRYPTOPP_LIBRARIES ${CRYPTOPP_LIBRARY})
+set(CRYPTOPP_INCLUDE_DIRS ${CRYPTOPP_INCLUDE_DIR})
diff --git a/src/cmake-modules/FindMPIR.cmake b/src/cmake-modules/FindMPIR.cmake
new file mode 100644
index 00000000..ed3f9d1a
--- /dev/null
+++ b/src/cmake-modules/FindMPIR.cmake
@@ -0,0 +1,23 @@
+# Find module for the MPIR library.
+#
+# Input:   MPIR_ROOT_DIR  optional hint; its include/ and lib/ are searched
+# Output:  MPIR_INCLUDE_DIRS, MPIR_LIBRARIES, MPIR_FOUND
+
+find_path(MPIR_INCLUDE_DIR
+        NAMES mpir.h
+        HINTS "${MPIR_ROOT_DIR}/include"
+        )
+
+find_library(MPIR_LIBRARY
+        NAMES mpir
+        HINTS "${MPIR_ROOT_DIR}/lib"
+        )
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MPIR DEFAULT_MSG MPIR_LIBRARY MPIR_INCLUDE_DIR)
+
+# Hide the cache entries created by find_path/find_library above.
+mark_as_advanced(MPIR_LIBRARY MPIR_INCLUDE_DIR)
+
+set(MPIR_LIBRARIES ${MPIR_LIBRARY})
+set(MPIR_INCLUDE_DIRS ${MPIR_INCLUDE_DIR})
diff --git a/src/cmake-modules/FindMPIRXX.cmake b/src/cmake-modules/FindMPIRXX.cmake
new file mode 100644
index 00000000..edf76520
--- /dev/null
+++ b/src/cmake-modules/FindMPIRXX.cmake
@@ -0,0 +1,18 @@
+# Find module for the mpirxx library (header mpirxx.h).
+#
+# Input:   MPIR_ROOT_DIR  optional hint; its include/ and lib/ are searched
+# Output:  MPIRXX_INCLUDE_DIRS, MPIRXX_LIBRARIES, MPIRXX_FOUND
+
+find_path(MPIRXX_INCLUDE_DIR NAMES mpirxx.h HINTS "${MPIR_ROOT_DIR}/include")
+find_library(MPIRXX_LIBRARY NAMES mpirxx HINTS "${MPIR_ROOT_DIR}/lib")
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MPIRXX DEFAULT_MSG
+        MPIRXX_LIBRARY MPIRXX_INCLUDE_DIR)
+
+# Publish the plural result variables from the singular search results.
+set(MPIRXX_INCLUDE_DIRS ${MPIRXX_INCLUDE_DIR})
+set(MPIRXX_LIBRARIES ${MPIRXX_LIBRARY})
+
+# Hide the cache entries created by find_path/find_library above.
+mark_as_advanced(MPIRXX_LIBRARY MPIRXX_INCLUDE_DIR)
diff --git a/src/config.h b/src/config.h
index 4b8a8387..ae84195b 100644
--- a/src/config.h
+++ b/src/config.h
@@ -10,31 +10,11 @@ All rights reserved
 #define _CONFIG
 
 /* The minimum batch size for offline production per call 
- * to base routine. For FHE based routines these are likely to
- * be exceeded by a huge amount!
+ * to base routine. 
  */
 #define sz_offline_batch 60000
 #define sz_IO_batch 10000
 
-
-/* This are the maximum batch sizes for sacrificing per call
- * 
- * Note we do not need many squares in the end so this max
- * can be set quite low if we want
- */
-
-#define sz_triples_sacrifice 10000
-#define sz_squares_sacrifice 10000
-#define sz_bits_sacrifice 10000
-
-/* The max number of triples etc in the offline production queue 
- * These numbers are to avoid memory filling up, if data is not
- * being consumed. These are the size of the queues before sacrificing
- */
-#define max_triples_offline 200000
-#define max_squares_offline 200000
-#define max_bits_offline 200000
-
 /* The max number of triples etc in the sacrifice production queue 
  * where we stop producing stuff.
  * The actual queues may end up being larger, as we dont want to throw
@@ -128,6 +108,16 @@ All rights reserved
  */
 #define OT_stat_sec 40
 
+/* This is the bound we use for the NewHope approximation
+ * to a discrete Gaussian with sigma=sqrt(B/2)
+ */
+#define NewHopeB 1
+
+/* This gives the Hamming Weight of the secret key distribution.
+ * Set this to -1 to use a Gaussian secret key distribution
+ */
+#define HwtSK 64
+
 /* Approx max size of each aBit and aAND queues in the OT thread */
 #define max_aBit_queue 500000
 #define max_aAND_queue 200000