diff --git a/.gitignore b/.gitignore index 56cf42555..17a4dc647 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,7 @@ training/tf/venv leelaz-model* *.orig leelaz_opencl_tuning +/build-autogtp-* +/build-validation-* +.vs/ +build/ diff --git a/.gitmodules b/.gitmodules index 48294b183..b78eb5e33 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "gtest"] path = gtest url = https://github.com/google/googletest.git +[submodule "src/Eigen"] + path = src/Eigen + url = https://github.com/eigenteam/eigen-git-mirror diff --git a/.travis.yml b/.travis.yml index 558ca56b3..85a549a4c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,14 +15,20 @@ jobs: - docker build -f Dockerfiles/Dockerfile.gpu -t leela-zero:gpu . - docker run leela-zero:gpu - script: - - docker build -f Dockerfiles/Dockerfile.gpu-half -t leela-zero:gpu-half . - - docker run leela-zero:gpu-half + - docker build -f Dockerfiles/Dockerfile.gpu-blas -t leela-zero:gpu-blas . + - docker run leela-zero:gpu-blas - script: - docker build -f Dockerfiles/Dockerfile.cpu -t leela-zero:cpu . - docker run leela-zero:cpu + - script: + - docker build -f Dockerfiles/Dockerfile.cpu-blas -t leela-zero:cpu-blas . + - docker run leela-zero:cpu-blas - script: - docker build -f Dockerfiles/Dockerfile.tests -t leela-zero:tests . - docker run leela-zero:tests + - script: + - docker build -f Dockerfiles/Dockerfile.tests-blas -t leela-zero:tests-blas . + - docker run leela-zero:tests-blas - stage: style before_install: - script: find . 
-regex ".*\.\(cpp\|h\|hpp\)" -not -regex ".*moc_.*.cpp" -not -path "./gtest/*" -not -path "./training/*" -not -path "./src/half/*" -not -path "./src/CL/*" | xargs python2 scripts/cpplint.py --filter=-build/c++11,-build/include,-build/include_order,-build/include_what_you_use,-build/namespaces,-readability/braces,-readability/casting,-readability/fn_size,-readability/namespace,-readability/todo,-runtime/explicit,-runtime/indentation_namespace,-runtime/int,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comma,-whitespace/comments,-whitespace/empty_loop_body,-whitespace/line_length,-whitespace/semicolon + script: find . -regex ".*\.\(cpp\|h\|hpp\)" -not -regex ".*moc_.*.cpp" -not -path "./gtest/*" -not -path "./training/*" -not -path "./src/half/*" -not -path "./src/CL/*" -not -path "./src/Eigen/*" | xargs python2 scripts/cpplint.py --filter=-build/c++11,-build/include,-build/include_order,-build/include_what_you_use,-build/namespaces,-readability/braces,-readability/casting,-readability/fn_size,-readability/namespace,-readability/todo,-runtime/explicit,-runtime/indentation_namespace,-runtime/int,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comma,-whitespace/comments,-whitespace/empty_loop_body,-whitespace/line_length,-whitespace/semicolon diff --git a/AUTHORS b/AUTHORS index 5e650c236..31ceccc54 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,40 +1,63 @@ Gian-Carlo Pascutto Seth Troisi -Marco Calignano Henrik Forstén -Andy Olsen -Junhee Yoo TFiFiE +Junhee Yoo +Marco Calignano +Andy Olsen +Hersmunch Bood Qian Peter Wen ywrt Arseny Krasutsky earthengine +Jonathan Roy +Mankit Pong michael -Hersmunch Barry G Becker +Junyan Xu Maks Kolman +kuba97531 Antti Korhonen -Mankit Pong +Chin-Chang Yang Xingcan LAN +bittsitt tux3 5525345551 +Adrian Petrescu +Akita Noek Alderi-Tokori +Alexander Taylor +Ancalagon Ashley Griffiths +Barry Becker Ed Lee Eddh F. 
Huizinga FFLaguna Jiannan Liu Joe Ren -Jonathan Roy +LL145 +Mark Andrew Gerads Nate +OmnipotentEntity +Przemek Wesołek +Sebastian H Shen-Ta Hsieh(BestSteve) Virgile Andreani +Ximin Luo +ZenStone +Zhenzhen Zhan afalturki +betterworld cheshirecats +dbosst fohristiwhirl gaieepo +ncaq tterava +wonderingabout +zediir +zliu1022 Пахотин Иван Google LLC diff --git a/CMakeLists.txt b/CMakeLists.txt index 29714df6a..617e4b2f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ # This file is part of Leela Zero. # Copyright (C) 2017 Marco Calignano -# Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors +# Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors # Leela Zero is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -15,6 +15,7 @@ cmake_minimum_required(VERSION 3.1) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +include(GNUInstallDirs) project(leelaz) add_subdirectory(gtest EXCLUDE_FROM_ALL) # We don't want to install gtest, exclude it from `all` @@ -22,7 +23,7 @@ add_subdirectory(gtest EXCLUDE_FROM_ALL) # We don't want to install gtest, exclu # Required Packages set(Boost_MIN_VERSION "1.58.0") set(Boost_USE_MULTITHREADED ON) -find_package(Boost 1.58.0 REQUIRED program_options) +find_package(Boost 1.58.0 REQUIRED program_options filesystem) find_package(Threads REQUIRED) find_package(ZLIB REQUIRED) find_package(OpenCL REQUIRED) @@ -32,14 +33,20 @@ find_package(OpenCL REQUIRED) if(NOT APPLE) set(BLA_VENDOR OpenBLAS) endif() -find_package(BLAS REQUIRED) -find_path(BLAS_INCLUDE_DIRS openblas_config.h - /usr/include - /usr/local/include - /usr/include/openblas - /opt/OpenBLAS/include - /usr/include/x86_64-linux-gnu - $ENV{BLAS_HOME}/include) +if(USE_BLAS) + message(STATUS "Looking for system BLAS/OpenBLAS library.") + find_package(BLAS REQUIRED) + 
find_path(BLAS_INCLUDE_DIRS openblas_config.h + /usr/include + /usr/local/include + /usr/include/openblas + /opt/OpenBLAS/include + /usr/include/x86_64-linux-gnu + $ENV{BLAS_HOME}/include) + add_definitions(-DUSE_BLAS) +else() +message(STATUS "Using built-in matrix library.") +endif() find_package(Qt5Core) set(CMAKE_CXX_STANDARD 14) @@ -79,7 +86,7 @@ if(USE_HALF) add_definitions(-DUSE_HALF) endif() -set(IncludePath "${CMAKE_CURRENT_SOURCE_DIR}/src") +set(IncludePath "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/src/Eigen") set(SrcPath "${CMAKE_CURRENT_SOURCE_DIR}/src") include_directories(${IncludePath}) @@ -108,7 +115,7 @@ target_link_libraries(leelaz ${BLAS_LIBRARIES}) target_link_libraries(leelaz ${OpenCL_LIBRARIES}) target_link_libraries(leelaz ${ZLIB_LIBRARIES}) target_link_libraries(leelaz ${CMAKE_THREAD_LIBS_INIT}) -install(TARGETS leelaz DESTINATION bin) +install(TARGETS leelaz DESTINATION ${CMAKE_INSTALL_BINDIR}) if(Qt5Core_FOUND) if(NOT Qt5Core_VERSION VERSION_LESS "5.3.0") @@ -135,3 +142,21 @@ target_link_libraries(tests ${BLAS_LIBRARIES}) target_link_libraries(tests ${OpenCL_LIBRARIES}) target_link_libraries(tests ${ZLIB_LIBRARIES}) target_link_libraries(tests gtest_main ${CMAKE_THREAD_LIBS_INIT}) + +include(GetGitRevisionDescription) +git_describe(VERSION --tags) +string(REGEX REPLACE "^v([0-9]+)\\..*" "\\1" MAJOR_VERSION "${VERSION}") +string(REGEX REPLACE "^v[0-9]+\\.([0-9]+).*" "\\1" MINOR_VERSION "${VERSION}") + +SET(CPACK_GENERATOR "DEB") +SET(CPACK_DEBIAN_PACKAGE_NAME "leelaz") +SET(CPACK_DEBIAN_PACKAGE_VERSION "${MAJOR_VERSION}.${MINOR_VERSION}") +SET(CPACK_DEBIAN_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR}) +SET(CPACK_DEBIAN_PACKAGE_MAINTAINER "Gian-Carlo Pascutto https://github.com/gcp/leela-zero") +SET(CPACK_DEBIAN_PACKAGE_DESCRIPTION "Go engine with no human-provided knowledge, modeled after the AlphaGo Zero paper.") +SET(CPACK_DEBIAN_PACKAGE_PRIORITY "optional") +SET(CPACK_DEBIAN_PACKAGE_SECTION "games") 
+SET(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) +SET(CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${MAJOR_VERSION}.${MINOR_VERSION}") + +INCLUDE(CPack) diff --git a/COLAB.md b/COLAB.md deleted file mode 100644 index 4f277a86d..000000000 --- a/COLAB.md +++ /dev/null @@ -1,32 +0,0 @@ -# Run Leela Zero client on a Tesla K80 GPU for free (Google Colaboratory) - -[Google Colaboratory](https://colab.research.google.com) (Colab) is a free tool for machine learning research. It's a Python notebook environment on a personal VM with a NVIDIA Tesla K80 GPU. Using Colab requires **no installation and runs in your browser**. - -This example shows how to run a **Leela Zero client on the K80 GPU to contribute training games**. You can expect to contribute 60-80 games/day. - -Google offers **free and unlimited access to the GPU**, but each session will **stop running after 12 hours of use and need to be restarted**. You must also keep the browser tab open. More details are below. - -**Do not use multiple accounts for training.** Google has notified us they will block users for this. - -## Running the GPU client - -* Sign in to your Google account and [open the notebook in Google Colab](https://colab.research.google.com/drive/1WQfPOFhkahIJSxdIjeSQqK4q30j3T1qF). -* **File** -> **Save a copy in Drive…**. -* When the copied notebook opens, click **Runtime** -> **Run All**, which will run each of the cells in order. This will take around 10 minutes to complete. - -Note: Google offers **unlimited access to the GPU**, but each session will **stop running after 12 hours of use and need to be restarted**. The animated spinning "stop" symbol will turn into a static red "play" symbol when the cell has stopped. You can restart with **Runtime** -> **Restart Runtime** followed by **Runtime** -> **Run All**. A simple macro would work to automate the restarting process. - -A session will also stop if you close the browser tab running Colab (about ~1.5 hours after closing the tab). 
To ensure the client runs for the full 12 hours, please **keep the tab open**. - -## Troubleshooting - * If you get a **"signal: aborted (core dumped)" error** when running the client or **"failed to assign a backend"** popup (examples below), there are no GPUs available on Google Colab. Try **Runtime** -> **Restart Runtime** and running again, or **kill the entire VM** with `!kill -9 -1` and try again (VM may take 5 minutes to restart after being killed). **As Google Colab has a limited number of free GPUs, you may just have to try again another time.** -``` -cl::Error -what(): clGetPlatformIDs -2018/04/18 14:52:31 signal: aborted (core dumped) -``` -![No GPUs](https://i.imgur.com/UI63IrA.png) - * If the notebook appears to be stuck in "Initializing" and won't run, try restarting as above. After restart, you should see "Connected" with a green checkmark. - -## Other Platforms - * Other paid platforms offer a similar service as Google Colab (Jupyter notebook Python environment for machine learning). For example, [FloydHub](https://www.floydhub.com/) offers a free 2-hour Tesla K80 GPU trial, and a [working Jupyter notebook is available here](https://drive.google.com/open?id=1c0rxfB5r-5-JhfNAjJfvjDFBSVYIFOq7) (developed by @scs-ben). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..778ead8f6 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,110 @@ +# Contributing to Leela Zero + +## C++ Usage + +Leela Zero is written in C++14, and generally encourages writing in modern C++ style. + +This means that: + +* The code overwhelmingly uses Almost Always Auto style, and so should you. +* Prefer range based for and non-member (c)begin/(c)end. +* You can rely on boost 1.58.0 or later being present. +* Manipulation of raw pointers is to be avoided as much as possible. +* Prefer constexpr over defines or constants. +* Prefer "using" over typedefs. +* Prefer uniform initialization. +* Prefer default initializers for member variables. 
+* Prefer emplace_back and making use of move assignment. +* Aim for const-correctness. Prefer passing non-trivial parameters by const reference. +* Use header include guards, not #pragma once (pragma once is non-standard, has issues with detecting identical files, and is slower https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58770) +* config.h is always the first file included. +* Feel free to use templates, but remember that debugging obscure template metaprogramming bugs is not something people enjoy doing in their spare time. +* Using exceptions is allowed. + +## Code Style + +* Look at the surrounding code and the rest of the project! +* Indentation is 4 spaces. No tabs. +* public/private/protected access modifiers are de-indented +* Maximum line length is 80 characters. There are rare exceptions in the code, usually involving user-visible text strings. +* Ifs are always braced, with very rare exceptions when everything fits on one line and doing it properly makes the code less readable. +* The code generally avoids any pointer passing and allows non-const references for parameters. Still, for new code it should be preferred to a) put input parameters first b) use return values over output parameters. +* Function arguments that wrap are aligned. +* Member variables in a class have an m_ prefix and are private. Members of POD structs don't and aren't. +* Constants and enum values are ALL_CAPS. +* Variables are lowercase. +* Function names are underscore_case. +* Classes are CamelCase. +* Comments are preferably full sentences with proper capitalization and a period. +* Split the includes list into config.h, standard headers and our headers. + +If something is not addressed here or there is no similar code, the Google C++ Style Guide is always a good reference. + +We might move to enforce clang-format at some point. 
+ +## Adding dependencies + +C++ does not quite have the package systems JavaScript and Rust have, so some restraint should be exercised when adding dependencies. Dependencies typically complicate the build for new contributors, especially on Windows, and reliance on specific, new versions can be a nuisance on Unix based systems. + +The restraints on modern header-only libraries are significantly less because they avoid most of the above problems. + +If a library is not mature and well-supported on Windows, Linux *and* macOS, you do not want it. + +This is not an excuse to re-invent the wheel. + +## Upgrading dependencies + +The code and dependencies should target the latest stable versions of Visual Studio/MSVC, and the latest stable/LTS releases of common Linux distros, with some additional delay as not everyone will be able to upgrade to a new stable/LTS right away. + +For example, upgrading to C++17 or boost 1.62.0 (oldest version in a Debian stable or Ubuntu LTS release) can be considered if there's a compelling use case and/or we can confirm it is supported on all platforms we reasonably target. + +## Merging contributions + +Contributions come in the form of pull requests against the "next" branch. + +They are rebased or squashed on top of the next branch, so the history will stay linear, i.e. no merge commits. + +Commit messages follow Linux kernel style: a summary phrase that is no more than 70-75 characters (but preferably <50) and describes both what the patch changes, as well as why the patch might be necessary. + +If the patch is to a specific subsystem (AutoGTP, Validation, ...) then prefix the summary by that subsystem (e.g. AutoGTP: ...). + +This is followed by a blank line, and a description that is wrapped at 72 characters. Good patch descriptions can be large time savers when someone has to bugfix the code afterwards. + +The end of the commit message should mention which (github) issue the patch fixes, if any, and the pull request it belongs to.
+ +Patches need to be reviewed before merging. Try to find the person who worked on the code last, or who has done work in nearby code (git blame is your friend, and this is why we write proper commit messages...). With some luck that is someone with write access to the repository. If not, you'll have to ping someone who does. + +Experience says that the majority of the pull requests won't live up to this ideal, which means that maintainers will have to squash patch series and clean up the commit message to be coherent before merging. + +If you are a person with write access to the repo, and are about to merge a commit, ask yourself the following question: am I confident enough that I understand this code, so that I can and am willing to go in and fix it if it turns out to be necessary? If the answer to this question is no, then do not merge the code. Not merging a contribution (quickly) is annoying for the individual contributor. Merging a bad contribution is annoying for everyone who wants to contribute now and in the future. + +If a contributor can't be bothered to fix up the trailing whitespace in their patch, odds are they aren't going to be willing to fix the threading bug it introduces either. + +## "Improvements" and Automagic + +Improvements to the engine that can affect strength should include supporting data. This means no-regression tests for functional changes, and a proof of strength improvement for things which are supposed to increase strength. + +The tools in the validation directory are well-fit for this purpose, as +is the python tool "ringmaster". + +The number of configurable options should be limited where possible. If it is not possible for the author to make rules of thumb for suitable values for those options, then the majority of users have no hope of getting them right, and may mistakenly make the engine weaker. 
If you must introduce new ones, consider limiting their exposure to developers only via USE_TUNER and set a good default for them. + +## GTP Extensions + +GTP makes it possible to connect arbitrary engines to arbitrary interfaces. + +Unfortunately GTP 2 isn't extensive enough to realistically fit all needs of analysis GUIs, which means we have had to extend it. The lack of standardization here means that Go software is continuously catching up to the chess world, especially after UCI was introduced. We should aim to make this situation better, not worse. + +This means that extensions have the possibility of outliving Leela Zero (or any GUIs) provided they are well thought out. + +It makes sense to be thoughtful here, consider the responsibilities of both GUI and engine, and try to come up with flexible building blocks rather than a plethora of commands for very specific use cases. + +Experience and previous discussions can help understanding: + +* lz-analyze "avoid" and "allow" were added in pull request [#1949](https://github.com/leela-zero/leela-zero/pull/1949). +* lz-analyze got a side-to-move option in pull request [#1872](https://github.com/leela-zero/leela-zero/pull/1872) and [#1642](https://github.com/leela-zero/leela-zero/pull/1642). +* lz-analyze got a "prior" tag in pull request [#1836](https://github.com/leela-zero/leela-zero/pull/1836). +* lz-analyze was added in pull request [#1388](https://github.com/leela-zero/leela-zero/pull/1388). +* lz-setoption was added in pull request [#1741](https://github.com/leela-zero/leela-zero/pull/1741). +* Pull request [#2170](https://github.com/leela-zero/leela-zero/pull/2170) has some discussion regarding how to navigate SGF + files that were parsed by the engine via GTP.
diff --git a/Dockerfiles/Dockerfile.gpu-half b/Dockerfiles/Dockerfile.cpu-blas similarity index 51% rename from Dockerfiles/Dockerfile.gpu-half rename to Dockerfiles/Dockerfile.cpu-blas index 5d356b5e9..f20b97ff6 100644 --- a/Dockerfiles/Dockerfile.gpu-half +++ b/Dockerfiles/Dockerfile.cpu-blas @@ -1,6 +1,6 @@ FROM leela-zero:base -# GPU 16-bit (memory storage) build -RUN CXX=g++ CC=gcc cmake -DUSE_HALF=1 .. +# CPU build +RUN CXX=g++ CC=gcc cmake -DUSE_CPU_ONLY=1 -DUSE_BLAS=1 .. CMD cmake --build . --target leelaz --config Release -- -j2 diff --git a/Dockerfiles/Dockerfile.gpu-blas b/Dockerfiles/Dockerfile.gpu-blas new file mode 100644 index 000000000..2c9a51005 --- /dev/null +++ b/Dockerfiles/Dockerfile.gpu-blas @@ -0,0 +1,6 @@ +FROM leela-zero:base + +# GPU build +RUN CXX=g++ CC=gcc cmake -DUSE_BLAS=1 .. + +CMD cmake --build . --target leelaz --config Release -- -j2 diff --git a/Dockerfiles/Dockerfile.tests-blas b/Dockerfiles/Dockerfile.tests-blas new file mode 100644 index 000000000..3433ffc66 --- /dev/null +++ b/Dockerfiles/Dockerfile.tests-blas @@ -0,0 +1,8 @@ +FROM leela-zero:base + +# CPU build +RUN CXX=g++ CC=gcc cmake -DUSE_CPU_ONLY=1 -DUSE_BLAS=1 .. +RUN cmake --build . --target tests --config Release -- -j2 + +CMD ./tests + diff --git a/FAQ.md b/FAQ.md index 9c965de03..c3ef386b3 100644 --- a/FAQ.md +++ b/FAQ.md @@ -8,17 +8,6 @@ AZ also had this behavior, besides we're testing our approach right now. Please be patient. -## 为什么现在训练的是5/6 block网络,而AZ用的是20block ## -## Why the network size is only 6 blocks comparing to 20 blocks of AZ ## - -在项目起步阶段,较小的网络可以在短时间内得到结果,也可以尽早发现/解决问题, - -目前的主要目的是为了测试系统的可行性,这对今后的完整重现十分重要(为将来的大网络打好基础)。 - -This is effectively a testing run to see if the system works, and which things are important for doing a full run. I expected 10 to 100 people to run the client, not 600. - -Even so, the 20 block version is 13 times more computationally expensive, and expected to make SLOWER progress early on. 
I think it's unwise to do such a run unless it's proven that the setup works, because you are going to be in for a very long haul. - ## 为什么比较两个网络强弱时经常下十几盘就不下了 ## ## Why only dozens of games are played when comparing two networks ## @@ -33,27 +22,6 @@ We use SPRT to decide if a newly trained network is better. A better network is The MCTS playouts of self-play games is only 3200, and with noise added (For randomness of each move thus training has something to learn from). If you load Leela Zero with Sabaki, you'll probably find it is actually not that weak. -## 自对弈为什么使用1000的模拟次数,而不是AZ的1600 ## -## For self-play, why use 3200 visits instead of 1600 playouts as AZ ## - -没人知道AZ的1600是怎么得到的。这里的3200是基于下面几点估计得到的: - -1. 对于某一个选点,MCTS需要模拟几次才能得出概率结果。在开始阶段,每个选点的概率不会差太多,所以开始的360次模拟大概会覆盖整个棋盘。所以如果要让某些选点可以做几次模拟的话,大概需要2到3 x 360次的模拟。 - -2. 在computer-go上有人跑过7x7的实验,看到模拟次数从1000到2000的时候性能有提高。所以如果我们观察到瓶颈的时候,可能是可以考虑增加模拟次数。 - -3. 模拟次数太多会影响得到数据的速度。 - -Nobody knows. The Zero paper doesn't mention how they arrive at this number, and I know of no sound background to estimate the optimal. I chose it based on some observations: - -a) For the MCTS to feed back search probabilities to the learning, it must be able to achieve a reasonable amount of look-ahead on at least a few variations. In the beginning, when the network is untrained, the move probabilities are not very extreme, and this means that the first 360~ simulations will be spent expanding every answer at the root. So if we want to expand at least a few moves, we probably need 2 to 3 x 360 playouts. - -b) One person on computer-go, who ran a similar experiment on 7x7, reported that near the end of the learning, he observed increased performance from increasing the number from 1000 to 2000. So maybe this is worthwhile to try when the learning speed starts to decrease or flatten out. But it almost certainly isn't needed early on. - -c) Obviously, the speed of acquiring data is linearly related to this setting. 
- -So, the current number is a best guess based on these observations. To be sure what the best value is, one would have to rerun this experiment several times. - ## 有些自对弈对局非常短 ## ## Very short self-play games ends with White win?! ## diff --git a/README.md b/README.md index 67bf7a6a1..b194b1adb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ -[![Linux Build Status](https://travis-ci.org/gcp/leela-zero.svg?branch=next)](https://travis-ci.org/gcp/leela-zero) -[![Windows Build Status](https://ci.appveyor.com/api/projects/status/pf1hcgly8f1a8iu0/branch/next?svg=true)](https://ci.appveyor.com/project/gcp/leela-zero/branch/next) - - +[![Linux Build Status](https://travis-ci.org/leela-zero/leela-zero.svg?branch=next)](https://travis-ci.org/leela-zero/leela-zero) +[![Windows Build Status](https://ci.appveyor.com/api/projects/status/dcvp31x1e0yavrtf/branch/next?svg=true)](https://ci.appveyor.com/project/gcp/leela-zero-8arv1/branch/next) # What @@ -9,7 +7,7 @@ A Go program with no human provided knowledge. Using MCTS (but without Monte Carlo playouts) and a deep residual convolutional neural network stack. This is a fairly faithful reimplementation of the system described -in the Alpha Go Zero paper "[Mastering the Game of Go without Human Knowledge](https://deepmind.com/documents/119/agz_unformatted_nature.pdf)". +in the Alpha Go Zero paper "[Mastering the Game of Go without Human Knowledge](https://www.nature.com/articles/nature24270.epdf?author_access_token=VJXbVjaSHxFoctQQ4p2k4tRgN0jAjWel9jnR3ZoTv0PVW4gB86EEpGqTRDtpIz-2rmo8-KG06gqVobU5NSCFeHILHcVFUeMsbvwS-lxjqQGg98faovwjxeTUgZAUMnRQ)". For all intents and purposes, it is an open source AlphaGo Zero. # Wait, what? @@ -23,7 +21,7 @@ be an engine that is far stronger than the top humans. # Gimme the weights -Recomputing the AlphaGo Zero weights will [take about 1700 years on commodity hardware](http://computer-go.org/pipermail/computer-go/2017-October/010307.html). 
+Recomputing the AlphaGo Zero weights will [take about 1700 years on commodity hardware](http://web.archive.org/web/20190205013627/http://computer-go.org/pipermail/computer-go/2017-October/010307.html). One reason for publishing this program is that we are running a public, distributed effort to repeat the work. Working together, and especially @@ -32,6 +30,8 @@ a good network (which you can feed into this program, suddenly making it strong) # I want to help +## Using your own hardware + You need a PC with a GPU, i.e. a discrete graphics card made by NVIDIA or AMD, preferably not too old, and with the most recent drivers installed. @@ -40,111 +40,158 @@ lower. If your CPU is not *very* recent (Haswell or newer, Ryzen or newer), performance will be outright bad, and it's probably of no use trying to join the distributed effort. But you can still play, especially if you are patient. -[Running Leela Zero client on a Tesla K80 GPU for free (Google Colaboratory)](COLAB.md) - -## Windows +### Windows -Head to the Github releases page at https://github.com/gcp/leela-zero/releases, +Head to the Github releases page at https://github.com/leela-zero/leela-zero/releases, download the latest release, unzip, and launch autogtp.exe. It will connect to the server automatically and do its work in the background, uploading results after each game. You can just close the autogtp window to stop it. -## macOS and Linux +### macOS and Linux -Follow the instructions below to compile the leelaz binary, then go into -the autogtp subdirectory and follow [the instructions there](autogtp/README.md) -to build the autogtp binary. Copy the leelaz binary into the autogtp dir, and -launch autogtp. +Follow the instructions below to compile the leelaz and autogtp binaries in +the build subdirectory. Then run autogtp as explained in the +[contributing](#contributing) instructions below. +Contributing will start when you run autogtp. 
-# I just want to play right now +## Using a Cloud provider -Download the best known network weights file from: http://zero.sjeng.org/best-network +Many cloud companies offer free trials (or paid solutions, not discussed here) +that are usable for helping the leela-zero project. -And head to the [Usage](#usage) section of this README. +There are community maintained instructions available here: +* [Running Leela Zero client on a Tesla V100 GPU for free (Google Cloud Free Trial)](https://docs.google.com/document/d/1P_c-RbeLKjv1umc4rMEgvIVrUUZSeY0WAtYHjaxjD64/edit?usp=sharing) -If you prefer a more human style, a network trained from human games is available here: https://sjeng.org/zero/best_v1.txt.zip. +* [Running Leela Zero client on a Tesla V100 GPU for free (Microsoft Azure Cloud Free Trial)](https://docs.google.com/document/d/1DMpi16Aq9yXXvGj0OOw7jbd7k2A9LHDUDxxWPNHIRPQ/edit?usp=sharing) -# Compiling +# I just want to play with Leela Zero right now + +Download the best known network weights file from [here](https://zero.sjeng.org/best-network), or, if you prefer a more human style, +a (weaker) network trained from human games [here](https://sjeng.org/zero/best_v1.txt.zip). + +If you are on Windows, download an official release from [here](https://github.com/leela-zero/leela-zero/releases) and head to the [Usage](#usage-for-playing-or-analyzing-games) +section of this README. + +If you are on macOS, Leela Zero is available through [Homebrew](https://homebrew.sh), the de facto standard +package manager. You can install it with: +``` +brew install leela-zero +``` + +If you are on Unix, you have to compile the program yourself. Follow +the compilation instructions below and then read the [Usage](#usage-for-playing-or-analyzing-games) section. 
+ +# Compiling AutoGTP and/or Leela Zero ## Requirements * GCC, Clang or MSVC, any C++14 compiler -* Boost 1.58.x or later, headers and program_options library (libboost-dev & libboost-program-options-dev on Debian/Ubuntu) -* BLAS Library: OpenBLAS (libopenblas-dev) or (optionally) Intel MKL +* Boost 1.58.x or later, headers and program_options, filesystem and system libraries (libboost-dev, libboost-program-options-dev and libboost-filesystem-dev on Debian/Ubuntu) * zlib library (zlib1g & zlib1g-dev on Debian/Ubuntu) * Standard OpenCL C headers (opencl-headers on Debian/Ubuntu, or at -https://github.com/KhronosGroup/OpenCL-Headers/tree/master/opencl22/) +https://github.com/KhronosGroup/OpenCL-Headers/tree/master/CL) * OpenCL ICD loader (ocl-icd-libopencl1 on Debian/Ubuntu, or reference implementation at https://github.com/KhronosGroup/OpenCL-ICD-Loader) * An OpenCL capable device, preferably a very, very fast GPU, with recent -drivers is strongly recommended (OpenCL 1.1 support is enough). -If you do not have a GPU, modify config.h in the source and remove -the line that says "#define USE_OPENCL". +drivers is strongly recommended (OpenCL 1.1 support is enough). Don't +forget to install the OpenCL driver if this part is packaged separately +by the Linux distribution (e.g. nvidia-opencl-icd). +If you do not have a GPU, add the define "USE_CPU_ONLY", for example +by adding -DUSE_CPU_ONLY=1 to the cmake command line. +* Optional: BLAS Library: OpenBLAS (libopenblas-dev) or Intel MKL * The program has been tested on Windows, Linux and macOS. -## Example of compiling and running - Ubuntu +## Example of compiling - Ubuntu & similar # Test for OpenCL support & compatibility sudo apt install clinfo && clinfo # Clone github repo - git clone https://github.com/gcp/leela-zero - cd leela-zero/src - sudo apt install libboost-dev libboost-program-options-dev libopenblas-dev opencl-headers ocl-icd-libopencl1 ocl-icd-opencl-dev zlib1g-dev - make - cd ..
- wget http://zero.sjeng.org/best-network - src/leelaz --weights best-network + git clone https://github.com/leela-zero/leela-zero + cd leela-zero + git submodule update --init --recursive -## Example of compiling and running - macOS + # Install build dependencies + sudo apt install cmake g++ libboost-dev libboost-program-options-dev libboost-filesystem-dev opencl-headers ocl-icd-libopencl1 ocl-icd-opencl-dev zlib1g-dev + + # Use a stand alone build directory to keep source dir clean + mkdir build && cd build + + # Compile leelaz and autogtp in build subdirectory with cmake + cmake .. + cmake --build . + + # Optional: test if your build works correctly + ./tests + +## Example of compiling - macOS # Clone github repo - git clone https://github.com/gcp/leela-zero - cd leela-zero/src - brew install boost - make - cd .. - curl -O http://zero.sjeng.org/best-network - src/leelaz --weights best-network + git clone https://github.com/leela-zero/leela-zero + cd leela-zero + git submodule update --init --recursive + + # Install build dependencies + brew install boost cmake zlib -## Example of compiling and running - Windows + # Use a stand alone build directory to keep source dir clean + mkdir build && cd build + + # Compile leelaz and autogtp in build subdirectory with cmake + cmake .. + cmake --build . + + # Optional: test if your build works correctly + ./tests + +## Example of compiling - Windows # Clone github repo - git clone https://github.com/gcp/leela-zero + git clone https://github.com/leela-zero/leela-zero cd leela-zero + git submodule update --init --recursive + cd msvc Double-click the leela-zero2015.sln or leela-zero2017.sln corresponding to the Visual Studio version you have.
# Build from Visual Studio 2015 or 2017 - # Download to msvc\x64\Release - msvc\x64\Release\leelaz.exe --weights best-network -## Example of compiling and running - CMake (macOS/Ubuntu) +# Contributing - # Clone github repo - git clone https://github.com/gcp/leela-zero - cd leela-zero - git submodule update --init --recursive +For Windows, you can use a release package, see ["I want to help"](#windows). - # Use stand alone directory to keep source dir clean - mkdir build && cd build - cmake .. - make leelaz - make tests - ./tests - curl -O http://zero.sjeng.org/best-network - ./leelaz --weights best-network +Unix and macOS, after finishing the compile and while in the build directory: + # Copy leelaz binary to autogtp subdirectory + cp leelaz autogtp -# Usage + # Run AutoGTP to start contributing + ./autogtp/autogtp -The engine supports the [GTP protocol, version 2](https://www.lysator.liu.se/~gunnar/gtp/gtp2-spec-draft2/gtp2-spec.html). + +# Usage for playing or analyzing games Leela Zero is not meant to be used directly. You need a graphical interface for it, which will interface with Leela Zero through the GTP protocol. +The engine supports the [GTP protocol, version 2](https://www.lysator.liu.se/~gunnar/gtp/gtp2-spec-draft2/gtp2-spec.html). + +[Lizzie](https://github.com/featurecat/lizzie/releases) is a client specifically +for Leela Zero which shows live search probabilities, a win rate graph, and has +an automatic game analysis mode. Has binaries for Windows, Mac, and Linux. + [Sabaki](http://sabaki.yichuanshen.de/) is a very nice looking GUI with GTP 2 -capability. It should work with this engine. A lot of go software can -interface to an engine via GTP, so look around. +capability. + +[LeelaSabaki](https://github.com/SabakiHQ/LeelaSabaki) is modified to +show variations and winning statistics in the game tree, as well as a heatmap +on the game board. 
+ +[GoReviewPartner](https://github.com/pnprog/goreviewpartner) is a tool for +automated review and analysis of games using bots (saved as .rsgf files), +Leela Zero is supported. + +A lot of go software can interface to an engine via GTP, +so look around. Add the --gtp commandline option on the engine command line to enable Leela Zero's GTP support. You will need a weights file, specify that with the -w option. @@ -271,40 +318,56 @@ This requires a working installation of TensorFlow 1.4 or later: src/leelaz -w weights.txt dump_supervised bigsgf.sgf train.out exit - training/tf/parse.py train.out + training/tf/parse.py 6 128 train.out -This will run and regularly dump Leela Zero weight files to disk, as -well as snapshots of the learning state numbered by the batch number. -If interrupted, training can be resumed with: +This will run and regularly dump Leela Zero weight files (of networks with 6 +blocks and 128 filters) to disk, as well as snapshots of the learning state +numbered by the batch number. If interrupted, training can be resumed with: - training/tf/parse.py train.out leelaz-model-batchnumber + training/tf/parse.py 6 128 train.out leelaz-model-batchnumber # Todo -- [ ] List of package names for more distros -- [ ] Multi-GPU support for training -- [ ] Optimize Winograd transformations -- [ ] CUDA specific version using cuDNN -- [ ] AMD specific version using MIOpen +- [ ] Further optimize Winograd transformations. +- [ ] Improve GPU batching in the search. +- [ ] Root filtering for handicap play. +- More backends: +- [ ] MKL-DNN based backend. +- [ ] CUDA specific version using cuDNN or cuBLAS. +- [ ] AMD specific version using MIOpen/ROCm. 
# Related links * Status page of the distributed effort: -http://zero.sjeng.org -* Watch Leela Zero's training games live in a GUI: -https://github.com/fsparv/LeelaWatcher +https://zero.sjeng.org * GUI and study tool for Leela Zero: -https://github.com/CamWagner/lizzie -* Stockfish chess engine ported to Leela Zero framework: -https://github.com/glinscott/leela-chess +https://github.com/featurecat/lizzie +* Watch Leela Zero's training games live in a GUI: +https://github.com/barrybecker4/LeelaWatcher * Original Alpha Go (Lee Sedol) paper: https://storage.googleapis.com/deepmind-media/alphago/AlphaGoNaturePaper.pdf -* Newer Alpha Zero (Go, Chess, Shogi) paper: +* Alpha Go Zero paper: +https://deepmind.com/documents/119/agz_unformatted_nature.pdf +* Alpha Zero (Go, Chess, Shogi) paper: https://arxiv.org/pdf/1712.01815.pdf * AlphaGo Zero Explained In One Diagram: https://medium.com/applied-data-science/alphago-zero-explained-in-one-diagram-365f5abf67e0 +* Stockfish chess engine ported to Leela Zero framework: +https://github.com/LeelaChessZero/lczero +* Leela Chess Zero (chess optimized client) +https://github.com/LeelaChessZero/lc0 # License -The code is released under the GPLv3 or later, except for ThreadPool.h, cl2.hpp, -half.hpp and the clblast_level3 subdirs, which have specific licenses (compatible with GPLv3) mentioned in those files. +The code is released under the GPLv3 or later, except for ThreadPool.h, cl2.hpp, half.hpp and the eigen and clblast_level3 subdirs, which have specific licenses (compatible with GPLv3) mentioned in those files. 
+ +Additional permission under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with NVIDIA Corporation's libraries from the +NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural +Network library and/or the NVIDIA TensorRT inference library +(or a modified version of those libraries), containing parts covered +by the terms of the respective license agreement, the licensors of +this Program grant you additional permission to convey the resulting +work. diff --git a/appveyor.yml b/appveyor.yml index 3a6cfade2..674c729cd 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,12 +1,21 @@ version: '{build}' +image: +- Visual Studio 2015 +- Visual Studio 2017 configuration: Release platform: x64 +matrix: + fast_finish: true environment: matrix: - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + - cmake_build: 1 + - cmake_build: 1 features: USE_CPU_ONLY run_tests: 1 + - cmake_build: 1 + features: USE_CPU_ONLY USE_BLAS + run_tests: 1 + - msbuild: 1 skip_commits: files: - '**/*.md' @@ -15,30 +24,27 @@ skip_commits: - training/ - AUTHORS - COPYING -install: -- cmd: nuget restore msvc\VS2017 -PackagesDirectory pkgs -cache: pkgs -> appveyor.yml +for: +- matrix: + only: + - image: Visual Studio 2015 + install: + - cmd: set MSVCDIR=msvc\VS2015 + - cmd: echo %MSVCDIR% + - cmd: nuget restore %MSVCDIR% -PackagesDirectory msvc\packages +- matrix: + only: + - image: Visual Studio 2017 + install: + - cmd: set MSVCDIR=msvc\VS2017 + - cmd: echo %MSVCDIR% + - cmd: nuget restore %MSVCDIR% -PackagesDirectory msvc\packages build_script: -- cmd: >- - set PKG_FOLDER="%cd%\pkgs" - - git submodule update --init --recursive - - mkdir build - - cd build - - set BLAS_HOME="..\pkgs\OpenBLAS.0.2.14.1\lib\native" - - for /F %%f in ("%features%") do set DEFINES=%DEFINES% -D%%f=1 - - cmake -G "Visual Studio 15 2017 Win64" %DEFINES% -DCMAKE_PREFIX_PATH="%QTDIR%/lib/cmake/" 
-DBOOST_ROOT="C:/Libraries/boost_1_65_1" -DBOOST_LIBRARYDIR="C:/Libraries/boost_1_65_1/lib64-msvc-14.1" -DBoost_USE_STATIC_LIBS=ON -DZLIB_ROOT="%PKG_FOLDER%/zlib-msvc14-x64.1.2.11.7795/build/native" -DZLIB_LIBRARY="%PKG_FOLDER%/zlib-msvc14-x64.1.2.11.7795/build/native/zlib-msvc14-x64.targets" -DOpenCL_LIBRARY="%PKG_FOLDER%/opencl-nug.0.777.12/build/native/opencl-nug.targets" -DOpenCL_INCLUDE_DIR="%PKG_FOLDER%/opencl-nug.0.777.12/build/native/include" -DBLAS_LIBRARIES="%PKG_FOLDER%/OpenBLAS.0.2.14.1/build/native/openblas.targets" -Dgtest_force_shared_crt=ON .. - - cmake --build . --config Release -- /maxcpucount:1 +- cmd: if "%cmake_build%"=="1" %MSVCDIR%\cmake_build.bat +- cmd: if "%msbuild%"=="1" git submodule update --init --recursive +- cmd: if "%msbuild%"=="1" msbuild /t:build %MSVCDIR%\leela-zero.vcxproj test_script: -- cmd: >- - - set PATH=Release;%PATH +- cmd: if "%run_tests%"=="1" cd build && Release\tests.exe - if NOT "%run_tests%"=="" tests.exe +cache: msvc\packages -> appveyor.yml diff --git a/autogtp/.gitignore b/autogtp/.gitignore index 96b15e05e..4042e0246 100644 --- a/autogtp/.gitignore +++ b/autogtp/.gitignore @@ -2,9 +2,12 @@ /.qmake.stash /autogtp /*.sgf +/*.train +/storefile*.bin /*.gz /moc_*.cpp /moc_*.h +/autogtp.pro.user # Weight files /[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]*[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f] diff --git a/autogtp/CMakeLists.txt b/autogtp/CMakeLists.txt index be73e90c4..a97d5fc0a 100644 --- a/autogtp/CMakeLists.txt +++ b/autogtp/CMakeLists.txt @@ -7,4 +7,4 @@ add_executable(autogtp set_target_properties(autogtp PROPERTIES AUTOMOC 1) target_link_libraries(autogtp Qt5::Core) -install(TARGETS autogtp DESTINATION bin) +install(TARGETS autogtp DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/autogtp/Console.h b/autogtp/Console.h index d0be24b91..c44e9e1d4 100644 --- a/autogtp/Console.h +++ b/autogtp/Console.h @@ -22,32 +22,29 @@ #include #include #include -#include "stdio.h" - +#include #ifdef Q_OS_WIN - #include - #include 
- typedef QWinEventNotifier Notifier; +#include +#include +typedef QWinEventNotifier Notifier; #else - #include - typedef QSocketNotifier Notifier; +#include +typedef QSocketNotifier Notifier; #endif - -class Console : public QObject -{ +class Console : public QObject { Q_OBJECT public: - Console(QObject *parent = nullptr) + Console(QObject* parent = nullptr) : QObject(parent), #ifdef Q_OS_WIN m_notifier(GetStdHandle(STD_INPUT_HANDLE)) { #else m_notifier(fileno(stdin), Notifier::Read) { #endif - connect(&m_notifier, &Notifier::activated, this, &Console::readInput); - } + connect(&m_notifier, &Notifier::activated, this, &Console::readInput); + } ~Console() = default; signals: @@ -61,6 +58,7 @@ public slots: emit sendQuit(); } } + private: Notifier m_notifier; }; diff --git a/autogtp/Game.cpp b/autogtp/Game.cpp index 061124e0d..a41f6c4f1 100644 --- a/autogtp/Game.cpp +++ b/autogtp/Game.cpp @@ -16,58 +16,49 @@ along with Leela Zero. If not, see . */ -#include #include -#include +#include #include +#include +#include + #include "Game.h" -Game::Game(const QString& weights, const QString& opt, const QString& binary) : - QProcess(), - m_cmdLine(""), - m_binary(binary), - m_timeSettings("time_settings 0 1 0"), - m_resignation(false), - m_blackToMove(true), - m_blackResigned(false), - m_passes(0), - m_moveNum(0) -{ -#ifdef WIN32 - m_binary.append(".exe"); -#endif - m_cmdLine = m_binary + " " + opt + " " + weights; +Game::Game(const Engine& engine) + : QProcess(), + m_engine(engine), + m_isHandicap(false), + m_resignation(false), + m_blackToMove(true), + m_blackResigned(false), + m_passes(0), + m_moveNum(0) { m_fileName = QUuid::createUuid().toRfc4122().toHex(); } bool Game::checkGameEnd() { - return (m_resignation || - m_passes > 1 || - m_moveNum > (19 * 19 * 2)); + return (m_resignation || m_passes > 1 || m_moveNum > (19 * 19 * 2)); } void Game::error(int errnum) { QTextStream(stdout) << "*ERROR*: "; switch (errnum) { case Game::NO_LEELAZ: - QTextStream(stdout) - << "No 
- 'leelaz' binary found." << endl; + QTextStream(stdout) << "No 'leelaz' binary found." << endl; break; case Game::PROCESS_DIED: QTextStream(stdout) << "The 'leelaz' process died unexpected." << endl; break; case Game::WRONG_GTP: - QTextStream(stdout) - << "Error in GTP response." << endl; + QTextStream(stdout) << "Error in GTP response." << endl; break; case Game::LAUNCH_FAILURE: QTextStream(stdout) << "Could not talk to engine after launching." << endl; break; default: - QTextStream(stdout) - << "Unexpected error." << endl; + QTextStream(stdout) << "Unexpected error." << endl; break; } } @@ -108,7 +99,7 @@ bool Game::sendGtpCommand(QString cmd) { return true; } -void Game::checkVersion(const VersionTuple &min_version) { +void Game::checkVersion(const VersionTuple& min_version) { write(qPrintable("version\n")); waitForBytesWritten(-1); if (!waitReady()) { @@ -117,10 +108,10 @@ void Game::checkVersion(const VersionTuple &min_version) { } char readBuffer[256]; int readCount = readLine(readBuffer, 256); - //If it is a GTP comment just print it and wait for the real answer - //this happens with the winogard tuning + // If it is a GTP comment just print it and wait for the real answer + // this happens with the winograd tuning if (readBuffer[0] == '#') { - readBuffer[readCount-1] = 0; + readBuffer[readCount - 1] = 0; QTextStream(stdout) << readBuffer << endl; if (!waitReady()) { error(Game::PROCESS_DIED); @@ -145,16 +136,15 @@ void Game::checkVersion(const VersionTuple &min_version) { if (version_list.size() < 3) { version_list.append("0"); } - int versionCount = (version_list[0].toInt() - std::get<0>(min_version)) * 10000; + int versionCount = + (version_list[0].toInt() - std::get<0>(min_version)) * 10000; versionCount += (version_list[1].toInt() - std::get<1>(min_version)) * 100; versionCount += version_list[2].toInt() - std::get<2>(min_version); if (versionCount < 0) { - QTextStream(stdout) - << "Leela version is too old, saw " << version_buff - << " but expected " - 
<< std::get<0>(min_version) << "." - << std::get<1>(min_version) << "." - << std::get<2>(min_version) << endl; + QTextStream(stdout) << "Leela version is too old, saw " << version_buff + << " but expected " << std::get<0>(min_version) + << "." << std::get<1>(min_version) << "." + << std::get<2>(min_version) << endl; QTextStream(stdout) << "Check https://github.com/gcp/leela-zero for updates." << endl; exit(EXIT_FAILURE); @@ -165,8 +155,9 @@ void Game::checkVersion(const VersionTuple &min_version) { } } -bool Game::gameStart(const VersionTuple &min_version) { - start(m_cmdLine); +bool Game::gameStart(const VersionTuple& min_version, const QString& sgf, + const int moves) { + start(m_engine.getCmdLine()); if (!waitForStarted()) { error(Game::NO_LEELAZ); return false; @@ -175,8 +166,50 @@ bool Game::gameStart(const VersionTuple &min_version) { // check any return values. checkVersion(min_version); QTextStream(stdout) << "Engine has started." << endl; - sendGtpCommand(m_timeSettings); - QTextStream(stdout) << "Infinite thinking time set." << endl; + // If there is an sgf file to start playing from then it will contain + // whether there is handicap in use. If there is no sgf file then instead, + // check whether there are any handicap commands to send (these fail + // if the board is not empty). + // Then send the rest of the GTP commands after any SGF has been loaded so + // that they can override any settings loaded from the SGF. 
+ if (!sgf.isEmpty()) { + QFile sgfFile(sgf + ".sgf"); + if (!sgfFile.exists()) { + QTextStream(stdout) << "Cannot find sgf file " << sgf << endl; + exit(EXIT_FAILURE); + } + sgfFile.open(QIODevice::Text | QIODevice::ReadOnly); + const auto sgfData = QTextStream(&sgfFile).readAll(); + const auto re = QRegularExpression("HA\\[\\d+\\]"); + const auto match = re.match(sgfData); + m_isHandicap = match.hasMatch(); + sgfFile.close(); + if (moves == 0) { + loadSgf(sgf); + } else { + loadSgf(sgf, moves); + } + setMovesCount(moves); + } else { + for (auto command : m_engine.m_commands.filter("handicap")) { + QTextStream(stdout) << command << endl; + if (!sendGtpCommand(command)) { + QTextStream(stdout) << "GTP failed on: " << command << endl; + exit(EXIT_FAILURE); + } + m_isHandicap = true; + m_blackToMove = false; + } + } + const auto re = QRegularExpression("^((?!handicap).)*$"); + for (auto command : m_engine.m_commands.filter(re)) { + QTextStream(stdout) << command << endl; + if (!sendGtpCommand(command)) { + QTextStream(stdout) << "GTP failed on: " << command << endl; + exit(EXIT_FAILURE); + } + } + QTextStream(stdout) << "Starting GTP commands sent." << endl; return true; } @@ -194,7 +227,11 @@ void Game::move() { void Game::setMovesCount(int moves) { m_moveNum = moves; - m_blackToMove = (moves % 2) == 0; + // The game always starts at move 0 (GTP states that handicap stones are not + // part of the move history), so if there is no handicap then black moves on + // even numbered turns but if there is handicap then black moves on odd + // numbered turns. + m_blackToMove = (moves % 2) == (m_isHandicap ? 1 : 0); } bool Game::waitReady() { @@ -232,11 +269,10 @@ bool Game::readMove() { QTextStream(stdout) << m_moveNum << " ("; QTextStream(stdout) << (m_blackToMove ? 
"B " : "W ") << m_moveDone << ") "; QTextStream(stdout).flush(); - if (m_moveDone.compare(QStringLiteral("pass"), - Qt::CaseInsensitive) == 0) { + if (m_moveDone.compare(QStringLiteral("pass"), Qt::CaseInsensitive) == 0) { m_passes++; - } else if (m_moveDone.compare(QStringLiteral("resign"), - Qt::CaseInsensitive) == 0) { + } else if (m_moveDone.compare(QStringLiteral("resign"), Qt::CaseInsensitive) + == 0) { m_resignation = true; m_blackResigned = m_blackToMove; } else { @@ -251,13 +287,15 @@ bool Game::setMove(const QString& m) { } m_moveNum++; QStringList moves = m.split(" "); - if (moves.at(2) - .compare(QStringLiteral("pass"), Qt::CaseInsensitive) == 0) { + if (moves.at(2).compare(QStringLiteral("pass"), Qt::CaseInsensitive) == 0) { m_passes++; - } else if (moves.at(2) - .compare(QStringLiteral("resign"), Qt::CaseInsensitive) == 0) { + } else if (moves.at(2).compare(QStringLiteral("resign"), + Qt::CaseInsensitive) + == 0) { m_resignation = true; - m_blackResigned = (moves.at(1).compare(QStringLiteral("black"), Qt::CaseInsensitive) == 0); + m_blackResigned = + (moves.at(1).compare(QStringLiteral("black"), Qt::CaseInsensitive) + == 0); } else { m_passes = 0; } @@ -284,7 +322,7 @@ bool Game::getScore() { m_result = "B+Resign "; QTextStream(stdout) << "Score: " << m_result << endl; } - } else{ + } else { write("final_score\n"); waitForBytesWritten(-1); if (!waitReady()) { @@ -325,30 +363,30 @@ bool Game::writeSgf() { return sendGtpCommand(qPrintable("printsgf " + m_fileName + ".sgf")); } -bool Game::loadTraining(const QString &fileName) { +bool Game::loadTraining(const QString& fileName) { QTextStream(stdout) << "Loading " << fileName + ".train" << endl; return sendGtpCommand(qPrintable("load_training " + fileName + ".train")); - } bool Game::saveTraining() { - QTextStream(stdout) << "Saving " << m_fileName + ".train" << endl; - return sendGtpCommand(qPrintable("save_training " + m_fileName + ".train")); + QTextStream(stdout) << "Saving " << m_fileName + ".train" 
<< endl; + return sendGtpCommand(qPrintable("save_training " + m_fileName + ".train")); } - -bool Game::loadSgf(const QString &fileName) { +bool Game::loadSgf(const QString& fileName) { QTextStream(stdout) << "Loading " << fileName + ".sgf" << endl; return sendGtpCommand(qPrintable("loadsgf " + fileName + ".sgf")); } -bool Game::fixSgf(QString& weightFile, bool resignation) { - QFile sgfFile(m_fileName + ".sgf"); - if (!sgfFile.open(QIODevice::Text | QIODevice::ReadOnly)) { - return false; - } - QString sgfData = sgfFile.readAll(); - QRegularExpression re("PW\\[Human\\]"); +bool Game::loadSgf(const QString& fileName, const int moves) { + QTextStream(stdout) << "Loading " << fileName + ".sgf with " << moves + << " moves" << endl; + return sendGtpCommand(qPrintable("loadsgf " + fileName + ".sgf " + + QString::number(moves + 1))); +} + +void Game::fixSgfPlayer(QString& sgfData, const Engine& whiteEngine) { + QRegularExpression oldPlayer("PW\\[Human\\]"); QString playerName("PB[Leela Zero "); QRegularExpression le("PB\\[Leela Zero \\S+ "); QRegularExpressionMatch match = le.match(sgfData); @@ -356,10 +394,36 @@ bool Game::fixSgf(QString& weightFile, bool resignation) { playerName = match.captured(0); } playerName = "PW" + playerName.remove(0, 2); - playerName += weightFile.left(8); + playerName += whiteEngine.getNetworkFile().left(8); playerName += "]"; - sgfData.replace(re, playerName); + sgfData.replace(oldPlayer, playerName); +} +void Game::fixSgfComment(QString& sgfData, const Engine& whiteEngine, + const bool isSelfPlay) { + QRegularExpression oldComment("(C\\[Leela Zero)( options:.*)\\]"); + QString comment("\\1"); + if (!isSelfPlay) { + comment += " Black"; + } + comment += "\\2 Starting GTP commands:"; + for (const auto command : m_engine.m_commands) { + comment += " " + command; + } + if (!isSelfPlay) { + comment += " White options:"; + comment += whiteEngine.m_options + " " + whiteEngine.m_network; + comment += " Starting GTP commands:"; + for (const auto 
command : whiteEngine.m_commands) { + comment += " " + command; + } + } + comment += "]"; + comment.replace(QRegularExpression("\\s\\s+"), " "); + sgfData.replace(oldComment, comment); +} + +void Game::fixSgfResult(QString& sgfData, const bool resignation) { if (resignation) { QRegularExpression oldResult("RE\\[B\\+.*\\]"); QString newResult("RE[B+Resign] "); @@ -372,7 +436,18 @@ bool Game::fixSgf(QString& weightFile, bool resignation) { QString noPass(")"); sgfData.replace(lastpass, noPass); } +} +bool Game::fixSgf(const Engine& whiteEngine, const bool resignation, + const bool isSelfPlay) { + QFile sgfFile(m_fileName + ".sgf"); + if (!sgfFile.open(QIODevice::Text | QIODevice::ReadOnly)) { + return false; + } + QString sgfData = sgfFile.readAll(); + fixSgfPlayer(sgfData, whiteEngine); + fixSgfComment(sgfData, whiteEngine, isSelfPlay); + fixSgfResult(sgfData, resignation); sgfFile.close(); if (sgfFile.open(QFile::WriteOnly | QFile::Truncate)) { QTextStream out(&sgfFile); diff --git a/autogtp/Game.h b/autogtp/Game.h index c321d8414..64b554833 100644 --- a/autogtp/Game.h +++ b/autogtp/Game.h @@ -19,42 +19,85 @@ #ifndef GAME_H #define GAME_H +#include #include #include using VersionTuple = std::tuple; +class Engine { +public: + Engine(const QString& network, const QString& options, + const QStringList& commands = QStringList("time_settings 0 1 0"), + const QString& binary = QString("./leelaz")) + : m_binary(binary), + m_options(options), + m_network(network), + m_commands(commands) { +#ifdef WIN32 + m_binary.append(".exe"); +#endif + if (!QFileInfo::exists(m_binary)) { + m_binary.remove(0, 2); // ./leelaz -> leelaz + } + } + Engine() = default; + QString getCmdLine() const { + return m_binary + " " + m_options + " " + m_network; + } + QString getNetworkFile() const { + return QFileInfo(m_network).baseName(); + } + QString m_binary; + QString m_options; + QString m_network; + QStringList m_commands; +}; + class Game : QProcess { public: - Game(const QString& weights, - 
const QString& opt, - const QString& binary = QString("./leelaz")); + Game(const Engine& engine); ~Game() = default; - bool gameStart(const VersionTuple& min_version); + bool gameStart(const VersionTuple& min_version, + const QString& sgf = QString(), int moves = 0); void move(); - bool waitForMove() { return waitReady(); } + bool waitForMove() { + return waitReady(); + } bool readMove(); bool nextMove(); bool getScore(); - bool loadSgf(const QString &fileName); + bool loadSgf(const QString& fileName); + bool loadSgf(const QString& fileName, int moves); bool writeSgf(); - bool loadTraining(const QString &fileName); + bool loadTraining(const QString& fileName); bool saveTraining(); - bool fixSgf(QString& weightFile, bool resignation); + bool fixSgf(const Engine& whiteEngine, bool resignation, bool isSelfPlay); bool dumpTraining(); - QString getCmdLine() const { return m_cmdLine; } bool dumpDebug(); void gameQuit(); - QString getMove() const { return m_moveDone; } - QString getFile() const { return m_fileName; } + QString getMove() const { + return m_moveDone; + } + QString getFile() const { + return m_fileName; + } bool setMove(const QString& m); bool checkGameEnd(); - void setCmdLine(const QString& cmd) { m_cmdLine = cmd; } int getWinner(); - QString getWinnerName() const { return m_winner; } - int getMovesCount() const { return m_moveNum; } + QString getWinnerName() const { + return m_winner; + } + int getMovesCount() const { + return m_moveNum; + } void setMovesCount(int moves); - QString getResult() const { return m_result.trimmed(); } + int getToMove() const { + return m_blackToMove ? 
BLACK : WHITE; + } + QString getResult() const { + return m_result.trimmed(); + } enum { BLACK = 0, WHITE = 1, @@ -67,23 +110,26 @@ class Game : QProcess { WRONG_GTP, LAUNCH_FAILURE }; - QString m_cmdLine; - QString m_binary; - QString m_timeSettings; + Engine m_engine; QString m_winner; QString m_fileName; QString m_moveDone; QString m_result; + bool m_isHandicap; bool m_resignation; bool m_blackToMove; bool m_blackResigned; int m_passes; int m_moveNum; bool sendGtpCommand(QString cmd); - void checkVersion(const VersionTuple &min_version); + void checkVersion(const VersionTuple& min_version); bool waitReady(); bool eatNewLine(); void error(int errnum); + void fixSgfPlayer(QString& sgfData, const Engine& whiteEngine); + void fixSgfComment(QString& sgfData, const Engine& whiteEngine, + bool isSelfPlay); + void fixSgfResult(QString& sgfData, bool resignation); }; #endif /* GAME_H */ diff --git a/autogtp/Job.cpp b/autogtp/Job.cpp index b58265775..727e9cc0e 100644 --- a/autogtp/Job.cpp +++ b/autogtp/Job.cpp @@ -16,29 +16,25 @@ along with Leela Zero. If not, see . 
*/ -#include "Job.h" -#include "Game.h" -#include "Management.h" +#include #include +#include #include -#include -#include +#include "Job.h" -Job::Job(QString gpu, Management *parent) : - m_state(RUNNING), - m_option(""), - m_gpu(gpu), - m_boss(parent) -{ -} +#include "Game.h" +#include "Management.h" -void Job::init(const Order &o) { - m_option = " " + o.parameters()["options"] + m_gpu + " -g -q -w "; +Job::Job(QString gpu, Management* parent) + : m_state(RUNNING), m_gpu(gpu), m_boss(parent) {} + +void Job::init(const Order& o) { QStringList version_list = o.parameters()["leelazVer"].split("."); if (version_list.size() < 2) { QTextStream(stdout) - << "Unexpected Leela Zero version: " << o.parameters()["leelazVer"] << endl; + << "Unexpected Leela Zero version: " << o.parameters()["leelazVer"] + << endl; exit(EXIT_FAILURE); } if (version_list.size() < 3) { @@ -47,36 +43,30 @@ void Job::init(const Order &o) { std::get<0>(m_leelazMinVersion) = version_list[0].toInt(); std::get<1>(m_leelazMinVersion) = version_list[1].toInt(); std::get<2>(m_leelazMinVersion) = version_list[2].toInt(); - } -ProductionJob::ProductionJob(QString gpu, Management *parent) : -Job(gpu, parent) -{ -} +ProductionJob::ProductionJob(QString gpu, Management* parent) + : Job(gpu, parent), m_engine(Engine(QString(), QString())) {} -ValidationJob::ValidationJob(QString gpu, Management *parent) : -Job(gpu, parent) -{ -} +ValidationJob::ValidationJob(QString gpu, Management* parent) + : Job(gpu, parent), + m_engineFirst(Engine(QString(), QString())), + m_engineSecond(Engine(QString(), QString())) {} -WaitJob::WaitJob(QString gpu, Management *parent) : -Job(gpu, parent) -{ -} +WaitJob::WaitJob(QString gpu, Management* parent) : Job(gpu, parent) {} -Result ProductionJob::execute(){ +Result ProductionJob::execute() { Result res(Result::Error); - Game game("networks/" + m_network, m_option); - if (!game.gameStart(m_leelazMinVersion)) { + Game game(m_engine); + if (!game.gameStart(m_leelazMinVersion, m_sgf, 
m_moves)) { return res; } if (!m_sgf.isEmpty()) { - game.loadSgf(m_sgf); - game.loadTraining(m_sgf); - game.setMovesCount(m_moves); QFile::remove(m_sgf + ".sgf"); - QFile::remove(m_sgf + ".train"); + if (m_restore) { + game.loadTraining(m_sgf); + QFile::remove(m_sgf + ".train"); + } } do { game.move(); @@ -87,146 +77,144 @@ Result ProductionJob::execute(){ m_boss->incMoves(); } while (game.nextMove() && m_state.load() == RUNNING); switch (m_state.load()) { - case RUNNING: - QTextStream(stdout) << "Game has ended." << endl; - if (game.getScore()) { - game.writeSgf(); - game.fixSgf(m_network, false); - game.dumpTraining(); - if (m_debug) { - game.dumpDebug(); + case RUNNING: + QTextStream(stdout) << "Game has ended." << endl; + if (game.getScore()) { + game.writeSgf(); + game.fixSgf(m_engine, false, true); + game.dumpTraining(); + if (m_debug) { + game.dumpDebug(); + } } - } - res.type(Result::File); - res.add("file", game.getFile()); - res.add("winner", game.getWinnerName()); - res.add("moves", QString::number(game.getMovesCount())); - break; - case STORING: - res.type(Result::StoreSelfPlayed); - game.writeSgf(); - game.saveTraining(); - res.add("sgf", game.getFile()); - res.add("moves", QString::number(game.getMovesCount())); - break; - default: - break; + res.type(Result::File); + res.add("file", game.getFile()); + res.add("winner", game.getWinnerName()); + res.add("moves", QString::number(game.getMovesCount())); + break; + case STORING: + game.writeSgf(); + game.saveTraining(); + res.type(Result::StoreSelfPlayed); + res.add("sgf", game.getFile()); + res.add("moves", QString::number(game.getMovesCount())); + break; + default: + break; } game.gameQuit(); return res; } -void ProductionJob::init(const Order &o) { +void ProductionJob::init(const Order& o) { Job::init(o); - m_network = o.parameters()["network"]; - m_debug = o.parameters()["debug"] == "true"; - if (o.type() == Order::RestoreSelfPlayed) { - m_sgf = o.parameters()["sgf"]; - m_moves = 
o.parameters()["moves"].toInt(); - } else { - m_sgf = ""; - m_moves = 0; + m_engine.m_network = "networks/" + o.parameters()["network"] + ".gz"; + m_engine.m_options = " " + o.parameters()["options"] + m_gpu + " -g -q -w "; + if (o.parameters().contains("gtpCommands")) { + m_engine.m_commands = o.parameters()["gtpCommands"].split(","); } + m_debug = o.parameters()["debug"] == "true"; + m_sgf = o.parameters()["sgf"]; + m_moves = o.parameters()["moves"].toInt(); + m_restore = o.type() == Order::RestoreSelfPlayed; } -Result ValidationJob::execute(){ +Result ValidationJob::execute() { Result res(Result::Error); - Game first("networks/" + m_firstNet, m_option); - if (!first.gameStart(m_leelazMinVersion)) { + Game first(m_engineFirst); + if (!first.gameStart(m_leelazMinVersion, m_sgf, m_moves)) { return res; } - if (!m_sgfFirst.isEmpty()) { - first.loadSgf(m_sgfFirst); - first.setMovesCount(m_moves); - QFile::remove(m_sgfFirst + ".sgf"); - } - Game second("networks/" + m_secondNet, m_option); - if (!second.gameStart(m_leelazMinVersion)) { + Game second(m_engineSecond); + if (!second.gameStart(m_leelazMinVersion, m_sgf, m_moves)) { return res; } - if (!m_sgfSecond.isEmpty()) { - second.loadSgf(m_sgfSecond); - second.setMovesCount(m_moves); - QFile::remove(m_sgfSecond + ".sgf"); + if (!m_sgf.isEmpty()) { + QFile::remove(m_sgf + ".sgf"); } - QString wmove = "play white "; - QString bmove = "play black "; + const QString stringWhite = "white"; + const QString stringBlack = "black"; + // Start with the side to move set to the opposite of the expected way + // around because the game playing loop swaps the sides at the start of each + // iteration. This avoids having to test which side is to move on every + // iteration of the loop. 
+ auto gameToMove = &second; + auto colorToMove = &stringWhite; + auto gameOpponent = &first; + auto colorOpponent = &stringBlack; + if (first.getToMove() == Game::WHITE) { + std::swap(gameToMove, gameOpponent); + std::swap(colorToMove, colorOpponent); + } do { - first.move(); - if (!first.waitForMove()) { + std::swap(gameToMove, gameOpponent); + std::swap(colorToMove, colorOpponent); + gameToMove->move(); + if (!gameToMove->waitForMove()) { return res; } - first.readMove(); - m_boss->incMoves(); - if (first.checkGameEnd()) { - break; - } - second.setMove(bmove + first.getMove()); - second.move(); - if (!second.waitForMove()) { - return res; - } - second.readMove(); - m_boss->incMoves(); - first.setMove(wmove + second.getMove()); - second.nextMove(); - } while (first.nextMove() && m_state.load() == RUNNING); + gameToMove->readMove(); + m_boss->incMoves(); + gameOpponent->setMove("play " + *colorToMove + " " + + gameToMove->getMove()); + } while (gameToMove->nextMove() && m_state.load() == RUNNING); switch (m_state.load()) { - case RUNNING: - res.add("moves", QString::number(first.getMovesCount())); - QTextStream(stdout) << "Game has ended." << endl; - if (first.getScore()) { - res.add("score", first.getResult()); - res.add("winner", first.getWinnerName()); + case RUNNING: + QTextStream(stdout) << "Game has ended." 
<< endl; + if (first.getScore()) { + res.add("score", first.getResult()); + res.add("winner", first.getWinnerName()); + first.writeSgf(); + first.fixSgf(m_engineSecond, + (res.parameters()["score"] == "B+Resign"), false); + res.add("file", first.getFile()); + } + res.type(Result::Win); + res.add("moves", QString::number(first.getMovesCount())); + break; + case STORING: first.writeSgf(); - first.fixSgf(m_secondNet, (res.parameters()["score"] == "B+Resign")); - res.add("file", first.getFile()); - } - // Game is finished, send the result - res.type(Result::Win); - break; - case STORING: - res.type(Result::StoreMatch); - first.writeSgf(); - second.writeSgf(); - res.add("sgfFirst", first.getFile()); - res.add("sgfSecond", second.getFile()); - res.add("moves", QString::number(first.getMovesCount())); - break; - default: - break; + res.type(Result::StoreMatch); + res.add("sgf", first.getFile()); + res.add("moves", QString::number(first.getMovesCount())); + break; + default: + break; } first.gameQuit(); second.gameQuit(); return res; } -void ValidationJob::init(const Order &o) { +void ValidationJob::init(const Order& o) { Job::init(o); - m_firstNet = o.parameters()["firstNet"]; - m_secondNet = o.parameters()["secondNet"]; - if (o.type() == Order::RestoreMatch) { - m_sgfFirst = o.parameters()["sgfFirst"]; - m_sgfSecond = o.parameters()["sgfSecond"]; - m_moves = o.parameters()["moves"].toInt(); - } else { - m_sgfFirst = ""; - m_sgfSecond = ""; - m_moves = 0; + m_engineFirst.m_network = "networks/" + o.parameters()["firstNet"] + ".gz"; + m_engineFirst.m_options = + " " + o.parameters()["options"] + m_gpu + " -g -q -w "; + if (o.parameters().contains("gtpCommands")) { + m_engineFirst.m_commands = o.parameters()["gtpCommands"].split(","); + } + m_engineSecond.m_network = + "networks/" + o.parameters()["secondNet"] + ".gz"; + m_engineSecond.m_options = + " " + o.parameters()["optionsSecond"] + m_gpu + " -g -q -w "; + if (o.parameters().contains("gtpCommandsSecond")) { + 
m_engineSecond.m_commands = + o.parameters()["gtpCommandsSecond"].split(","); } + m_sgf = o.parameters()["sgf"]; + m_moves = o.parameters()["moves"].toInt(); } -Result WaitJob::execute(){ +Result WaitJob::execute() { Result res(Result::Waited); QThread::sleep(m_minutes * 60); return res; } -void WaitJob::init(const Order &o) { +void WaitJob::init(const Order& o) { Job::init(o); m_minutes = o.parameters()["minutes"].toInt(); } - - diff --git a/autogtp/Job.h b/autogtp/Job.h index 8805265e6..b56318294 100644 --- a/autogtp/Job.h +++ b/autogtp/Job.h @@ -19,11 +19,13 @@ #ifndef JOB_H #define JOB_H -#include "Result.h" -#include "Order.h" -#include #include +#include #include + +#include "Game.h" +#include "Order.h" +#include "Result.h" class Management; using VersionTuple = std::tuple; @@ -39,59 +41,62 @@ class Job : public QObject { Production = 0, Validation }; - Job(QString gpu, Management *parent); + Job(QString gpu, Management* parent); ~Job() = default; virtual Result execute() = 0; - virtual void init(const Order &o); - void finish() { m_state.store(FINISHING); } + virtual void init(const Order& o); + void finish() { + m_state.store(FINISHING); + } void store() { m_state.store(STORING); } protected: QAtomicInt m_state; - QString m_option; QString m_gpu; int m_moves; VersionTuple m_leelazMinVersion; - Management *m_boss; + Management* m_boss; }; - class ProductionJob : public Job { Q_OBJECT public: - ProductionJob(QString gpu, Management *parent); + ProductionJob(QString gpu, Management* parent); ~ProductionJob() = default; - void init(const Order &o); + void init(const Order& o); Result execute(); + private: - QString m_network; + Engine m_engine; QString m_sgf; bool m_debug; + bool m_restore; }; class ValidationJob : public Job { Q_OBJECT public: - ValidationJob(QString gpu, Management *parent); + ValidationJob(QString gpu, Management* parent); ~ValidationJob() = default; - void init(const Order &o); + void init(const Order& o); Result execute(); + private: - 
QString m_firstNet; - QString m_secondNet; - QString m_sgfFirst; - QString m_sgfSecond; + Engine m_engineFirst; + Engine m_engineSecond; + QString m_sgf; }; class WaitJob : public Job { Q_OBJECT public: - WaitJob(QString gpu, Management *parent); + WaitJob(QString gpu, Management* parent); ~WaitJob() = default; - void init(const Order &o); + void init(const Order& o); Result execute(); + private: int m_minutes; }; diff --git a/autogtp/Keypress.cpp b/autogtp/Keypress.cpp deleted file mode 100644 index 7cda8b878..000000000 --- a/autogtp/Keypress.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - This file is part of Leela Zero. - Copyright (C) 2017-2018 Marco Calignano - - Leela Zero is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Leela Zero is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Leela Zero. If not, see . 
-*/ - -#include "Keypress.h" -#include - -KeyPress::KeyPress(Management *boss, QObject *parent) : - QObject(parent), - m_boss(boss) { -} - -bool KeyPress::eventFilter(QObject *obj, QEvent *event) -{ - if (event->type() == QEvent::KeyPress) { - QKeyEvent *keyEvent = static_cast(event); - if (keyEvent->modifiers() == Qt::ControlModifier && - keyEvent->key() == Qt::Key_C ) { - m_boss->storeGames(); - } - qDebug("Ate key press %d", keyEvent->key()); - return true; - } else { - // standard event processing - return QObject::eventFilter(obj, event); - } -} diff --git a/autogtp/Management.cpp b/autogtp/Management.cpp index 069b0bfce..fadd4a187 100644 --- a/autogtp/Management.cpp +++ b/autogtp/Management.cpp @@ -16,68 +16,77 @@ along with Leela Zero. If not, see . */ -#include -#include -#include -#include -#include #include +#include +#include #include #include +#include #include -#include #include +#include +#include +#include +#include +#include + #include "Management.h" -#include "Game.h" +#include "Game.h" constexpr int RETRY_DELAY_MIN_SEC = 30; -constexpr int RETRY_DELAY_MAX_SEC = 60 * 60; // 1 hour -constexpr int MAX_RETRIES = 3; // Stop retrying after 3 times +constexpr int RETRY_DELAY_MAX_SEC = 60 * 60; // 1 hour +constexpr int MAX_RETRIES = 3; // Stop retrying after 3 times + +const QString server_url = "https://zero.sjeng.org/"; const QString Leelaz_min_version = "0.12"; -Management::Management(const int gpus, - const int games, - const QStringList& gpuslist, - const int ver, - const int maxGames, - const bool delNetworks, - const QString& keep, - const QString& debug) +Management::Management(const int gpus, const int games, + const QStringList& gpuslist, const int ver, + const int maxGames, const bool delNetworks, + const QString& keep, const QString& debug) : m_syncMutex(), - m_gamesThreads(gpus * games), - m_games(games), - m_gpus(gpus), - m_gpusList(gpuslist), - m_selfGames(0), - m_matchGames(0), - m_gamesPlayed(0), - m_keepPath(keep), - m_debugPath(debug), 
- m_version(ver), - m_fallBack(Order::Error), - m_lastMatch(Order::Error), - m_gamesLeft(maxGames), - m_threadsLeft(gpus * games), - m_delNetworks(delNetworks), - m_lockFile(nullptr) { -} - -void Management::runTuningProcess(const QString &tuneCmdLine) { + m_gamesThreads(gpus * games), + m_games(games), + m_gpus(gpus), + m_gpusList(gpuslist), + m_selfGames(0), + m_matchGames(0), + m_gamesPlayed(0), + m_keepPath(keep), + m_debugPath(debug), + m_version(ver), + m_fallBack(Order::Error), + m_lastMatch(Order::Error), + m_gamesLeft(maxGames), + m_threadsLeft(gpus * games), + m_delNetworks(delNetworks), + m_lockFile(nullptr) {} + +void Management::runTuningProcess(const QString& tuneCmdLine) { QTextStream(stdout) << tuneCmdLine << endl; QProcess tuneProcess; tuneProcess.start(tuneCmdLine); tuneProcess.waitForStarted(-1); while (tuneProcess.state() == QProcess::Running) { tuneProcess.waitForReadyRead(1000); + QByteArray text = tuneProcess.readAllStandardOutput(); + int version_start = text.indexOf("Leela Zero ") + 11; + if (version_start > 10) { + int version_end = text.indexOf(" ", version_start); + m_leelaversion = + QString(text.mid(version_start, version_end - version_start)); + } + QTextStream(stdout) << text; QTextStream(stdout) << tuneProcess.readAllStandardError(); } + QTextStream(stdout) << "Found Leela Version : " << m_leelaversion << endl; tuneProcess.waitForFinished(-1); } -Order Management::getWork(const QFileInfo &file) { - QTextStream(stdout) << "Got previously stored file" <order(getWork(finfo)); @@ -145,7 +152,8 @@ void Management::wait() { QTextStream(stdout) << "Management: waiting for workers" << endl; for (int i = 0; i < m_gpus * m_games; ++i) { m_gamesThreads[i]->wait(); - QTextStream(stdout) << "Management: Worker " << i+1 << " ended" << endl; + QTextStream(stdout) + << "Management: Worker " << i + 1 << " ended" << endl; } } @@ -156,17 +164,17 @@ void Management::getResult(Order ord, Result res, int index, int duration) { m_syncMutex.lock(); 
m_gamesPlayed++; switch (res.type()) { - case Result::File: - m_selfGames++, - uploadData(res.parameters(), ord.parameters()); - printTimingInfo(duration); - break; - case Result::Win: - case Result::Loss: - m_matchGames++, - uploadResult(res.parameters(), ord.parameters()); - printTimingInfo(duration); - break; + case Result::File: + m_selfGames++; + uploadData(res.parameters(), ord.parameters()); + printTimingInfo(duration); + break; + case Result::Win: + case Result::Loss: + m_matchGames++; + uploadResult(res.parameters(), ord.parameters()); + printTimingInfo(duration); + break; } sendAllGames(); if (m_gamesLeft == 0) { @@ -193,10 +201,9 @@ QFileInfo Management::getNextStored() { checkStoredGames(); while (!m_storedFiles.isEmpty()) { fi = m_storedFiles.takeFirst(); - m_lockFile = new QLockFile(fi.fileName()+".lock"); - if (m_lockFile->tryLock(10) && - fi.exists()) { - break; + m_lockFile = new QLockFile(fi.fileName() + ".lock"); + if (m_lockFile->tryLock(10) && fi.exists()) { + break; } delete m_lockFile; m_lockFile = nullptr; @@ -204,7 +211,7 @@ QFileInfo Management::getNextStored() { return fi; } -void Management::printTimingInfo(float duration) { +void Management::printTimingInfo(float duration) { auto game_end = std::chrono::high_resolution_clock::now(); auto total_time_s = @@ -213,16 +220,19 @@ void Management::printTimingInfo(float duration) { std::chrono::duration_cast(total_time_s); auto total_time_millis = std::chrono::duration_cast(total_time_s); - QTextStream(stdout) - << m_gamesPlayed << " game(s) (" << m_selfGames << " self played and " - << m_matchGames << " matches) played in " - << total_time_min.count() << " minutes = " - << total_time_s.count() / m_gamesPlayed << " seconds/game, " - << total_time_millis.count() / m_movesMade.load() << " ms/move" - << ", last game took " << int(duration) << " seconds." 
<< endl; + QTextStream(stdout) << m_gamesPlayed << " game(s) (" << m_selfGames + << " self played and " << m_matchGames + << " matches) played in " << total_time_min.count() + << " minutes = " << total_time_s.count() / m_gamesPlayed + << " seconds/game, " + << total_time_millis.count() / m_movesMade.load() + << " ms/move" + << ", last game took " << int(duration) << " seconds." + << endl; } -QString Management::getOption(const QJsonObject &ob, const QString &key, const QString &opt, const QString &defValue) { +QString Management::getOption(const QJsonObject& ob, const QString& key, + const QString& opt, const QString& defValue) { QString res; if (ob.contains(key)) { res.append(opt + ob.value(key).toString() + " "); @@ -234,10 +244,12 @@ QString Management::getOption(const QJsonObject &ob, const QString &key, const Q return res; } -QString Management::getBoolOption(const QJsonObject &ob, const QString &key, const QString &opt, bool defValue) { +QString Management::getBoolOption(const QJsonObject& ob, const QString& key, + const QString& opt, bool defValue) { QString res; if (ob.contains(key)) { - if (ob.value(key).toString().compare("true", Qt::CaseInsensitive) == 0) { + if (ob.value(key).toString().compare("true", Qt::CaseInsensitive) + == 0) { res.append(opt + " "); } } else { @@ -248,13 +260,15 @@ QString Management::getBoolOption(const QJsonObject &ob, const QString &key, con return res; } -QString Management::getOptionsString(const QJsonObject &opt, const QString &rnd) { +QString Management::getOptionsString(const QJsonObject& opt, + const QString& rnd) { QString options; options.append(getOption(opt, "playouts", " -p ", "")); options.append(getOption(opt, "visits", " -v ", "")); options.append(getOption(opt, "resignation_percent", " -r ", "1")); options.append(getOption(opt, "randomcnt", " -m ", "30")); - options.append(getOption(opt, "threads", " -t ", "1")); + options.append(getOption(opt, "threads", " -t ", "6")); + options.append(getOption(opt, 
"batchsize", " --batchsize ", "5")); options.append(getBoolOption(opt, "dumbpass", " -d ", true)); options.append(getBoolOption(opt, "noise", " -n ", true)); options.append(" --noponder "); @@ -264,40 +278,68 @@ QString Management::getOptionsString(const QJsonObject &opt, const QString &rnd) return options; } +QString Management::getGtpCommandsString(const QJsonValue& gtpCommands) { + const auto gtpCommandsJsonDoc = QJsonDocument(gtpCommands.toArray()); + const auto gtpCommandsJson = + gtpCommandsJsonDoc.toJson(QJsonDocument::Compact); + auto gtpCommandsString = QVariant(gtpCommandsJson).toString(); + gtpCommandsString.remove(QRegularExpression("[\\[\\]\"]")); + return gtpCommandsString; +} + Order Management::getWorkInternal(bool tuning) { Order o(Order::Error); /* { - "cmd" : "match", - "white_hash" : "223737476718d58a4a5b0f317a1eeeb4b38f0c06af5ab65cb9d76d68d9abadb6", - "black_hash" : "92c658d7325fe38f0c8adbbb1444ed17afd891b9f208003c272547a7bcb87909", - "options_hash" : "c2e3" - "required_client_version" : "5", - "leelaz_version" : "0.9", - "random_seed" : "1", - "options" : { - "playouts" : "1000", - "resignation_percent" : "3", - "noise" : "false", - "randomcnt" : "0" - } + cmd : "match", + white_hash : "223737476718d58a4a5b0f317a1eeeb4b38f0c06af5ab65cb9d76d68d9abadb6", + black_hash : "92c658d7325fe38f0c8adbbb1444ed17afd891b9f208003c272547a7bcb87909", + options_hash : "c2e3", + minimum_autogtp_version: "16", + random_seed: "2301343010299460478", + minimum_leelaz_version: "0.15", + options : { + playouts : "1000", + visits: "3201", + resignation_percent : "3", + noise : "true", + randomcnt : "30" + }, + white_options : { + playouts : "0", + visits: "1601", + resignation_percent : "5", + noise : "false", + randomcnt : "0" + }, + white_hash_gzip_hash: "23c29bf777e446b5c3fb0e6e7fa4d53f15b99cc0c25798b70b57877b55bf1638", + black_hash_gzip_hash: "ccfe6023456aaaa423c29bf777e4aab481245289aaaabb70b7b5380992377aa8", + hash_sgf_hash: 
"7dbccc5ad9eb38f0135ff7ec860f0e81157f47dfc0a8375cef6bf1119859e537", + moves_count: "92", + gtp_commands : [ "time_settings 600 30 1", "komi 0.5", "fixed_handicap 2" ], + white_gtp_commands : [ "time_settings 0 10 1", "komi 0.5", "fixed_handicap 2" ], } { - "cmd" : "selfplay", - "hash" : "223737476718d58a4a5b0f317a1eeeb4b38f0c06af5ab65cb9d76d68d9abadb6", - "options_hash" : "ee21", - "required_client_version" : "5", - "leelaz_version" : "0.9", - "random_seed" : "1", - "options" : { - "playouts" : 1000, - "resignation_percent" : "3", - "noise" : "true", - "randomcnt" : "30" - } + cmd : "selfplay", + hash : "223737476718d58a4a5b0f317a1eeeb4b38f0c06af5ab65cb9d76d68d9abadb6", + options_hash : "ee21", + minimum_autogtp_version: "16", + random_seed: "2301343010299460478", + minimum_leelaz_version: "0.15", + options : { + playouts : "1000", + visits: "3201", + resignation_percent : "3", + noise : "true", + randomcnt : "30" + }, + hash_gzip_hash: "23c29bf777e446b5c3fb0e6e7fa4d53f15b99cc0c25798b70b57877b55bf1638", + hash_sgf_hash: "7dbccc5ad9eb38f0135ff7ec860f0e81157f47dfc0a8375cef6bf1119859e537", + moves_count: "92", + gtp_commands : [ "time_settings 600 30 1", "komi 0.5", "fixed_handicap 4" ], } { @@ -311,11 +353,14 @@ Order Management::getWorkInternal(bool tuning) { prog_cmdline.append(".exe"); #endif prog_cmdline.append(" -s -J"); - prog_cmdline.append(" http://zero.sjeng.org/get-task/"); + prog_cmdline.append(" " + server_url + "get-task/"); if (tuning) { prog_cmdline.append("0"); } else { prog_cmdline.append(QString::number(AUTOGTP_VERSION)); + if (!m_leelaversion.isEmpty()) { + prog_cmdline.append("/" + m_leelaversion); + } } QProcess curl; curl.start(prog_cmdline); @@ -337,18 +382,21 @@ Order Management::getWorkInternal(bool tuning) { if (!tuning) { QTextStream(stdout) << doc.toJson() << endl; } - QMap parameters; + QMap parameters; QJsonObject ob = doc.object(); - //checking client version + // checking client version int required_version = 0; if 
(ob.contains("required_client_version")) { - required_version = ob.value("required_client_version").toString().toInt(); + required_version = + ob.value("required_client_version").toString().toInt(); } else if (ob.contains("minimum_autogtp_version")) { - required_version = ob.value("minimum_autogtp_version").toString().toInt(); + required_version = + ob.value("minimum_autogtp_version").toString().toInt(); } if (required_version > m_version) { - QTextStream(stdout) << "Required client version: " << required_version << endl; - QTextStream(stdout) << ' ' << endl; + QTextStream(stdout) + << "Required client version: " << required_version << endl; + QTextStream(stdout) << ' ' << endl; QTextStream(stdout) << "Server requires client version " << required_version << " but we are version " << m_version << endl; @@ -356,7 +404,7 @@ Order Management::getWorkInternal(bool tuning) { << "Check https://github.com/gcp/leela-zero for updates." << endl; exit(EXIT_FAILURE); } - //passing leela version + // passing leela version QString leelazVersion = Leelaz_min_version; if (ob.contains("leelaz_version")) { leelazVersion = ob.value("leelaz_version").toString(); @@ -365,59 +413,101 @@ Order Management::getWorkInternal(bool tuning) { } parameters["leelazVer"] = leelazVersion; - //getting the random seed + // getting the random seed QString rndSeed = "0"; - if (ob.contains("random_seed")) - rndSeed = ob.value("random_seed").toString(); + if (ob.contains("random_seed")) { + rndSeed = ob.value("random_seed").toString(); + } parameters["rndSeed"] = rndSeed; if (rndSeed == "0") { rndSeed = ""; } - //parsing options + // parsing options if (ob.contains("options")) { parameters["optHash"] = ob.value("options_hash").toString(); - parameters["options"] = getOptionsString(ob.value("options").toObject(), rndSeed); + parameters["options"] = + getOptionsString(ob.value("options").toObject(), rndSeed); + } + if (ob.contains("gtp_commands")) { + parameters["gtpCommands"] = + 
getGtpCommandsString(ob.value("gtp_commands")); + } + if (ob.contains("hash_sgf_hash")) { + parameters["sgf"] = + fetchGameData(ob.value("hash_sgf_hash").toString(), "sgf"); + parameters["moves"] = ob.contains("moves_count") + ? ob.value("moves_count").toString() + : "0"; } parameters["debug"] = !m_debugPath.isEmpty() ? "true" : "false"; if (!tuning) { - QTextStream(stdout) << "Got new job: " << ob.value("cmd").toString() << endl; + QTextStream(stdout) + << "Got new job: " << ob.value("cmd").toString() << endl; } if (ob.value("cmd").toString() == "selfplay") { QString net = ob.value("hash").toString(); - fetchNetwork(net); - o.type(Order::Production); + QString gzipHash = ob.value("hash_gzip_hash").toString(); + fetchNetwork(net, gzipHash); parameters["network"] = net; + + o.type(Order::Production); o.parameters(parameters); - if (m_delNetworks && - m_fallBack.parameters()["network"] != net) { - QTextStream(stdout) << "Deleting network " << "networks/" + m_fallBack.parameters()["network"] << endl; - QFile::remove("networks/" + m_fallBack.parameters()["network"]); + if (m_delNetworks && m_fallBack.parameters()["network"] != net) { + QTextStream(stdout) + << "Deleting network " + << "networks/" + m_fallBack.parameters()["network"] + ".gz" + << endl; + QFile::remove("networks/" + m_fallBack.parameters()["network"] + + ".gz"); } m_fallBack = o; QTextStream(stdout) << "net: " << net << "." << endl; } if (ob.value("cmd").toString() == "match") { - o.type(Order::Validation); QString net1 = ob.value("black_hash").toString(); + QString gzipHash1 = ob.value("black_hash_gzip_hash").toString(); QString net2 = ob.value("white_hash").toString(); - fetchNetwork(net1); - fetchNetwork(net2); + QString gzipHash2 = ob.value("white_hash_gzip_hash").toString(); + fetchNetwork(net1, gzipHash1); + fetchNetwork(net2, gzipHash2); parameters["firstNet"] = net1; parameters["secondNet"] = net2; + parameters["optionsSecond"] = + ob.contains("white_options") ? 
getOptionsString( + ob.value("white_options").toObject(), rndSeed) + : parameters["options"]; + if (ob.contains("gtp_commands")) { + parameters["gtpCommandsSecond"] = + ob.contains("white_gtp_commands") + ? getGtpCommandsString(ob.value("white_gtp_commands")) + : parameters["gtpCommands"]; + } + + o.type(Order::Validation); o.parameters(parameters); if (m_delNetworks) { - if (m_lastMatch.parameters()["firstNet"] != net1 && - m_lastMatch.parameters()["firstNet"] != net2) { - QTextStream(stdout) << "Deleting network " << "networks/" + m_lastMatch.parameters()["firstNet"] << endl; - QFile::remove("networks/" + m_lastMatch.parameters()["firstNet"]); + if (m_lastMatch.parameters()["firstNet"] != net1 + && m_lastMatch.parameters()["firstNet"] != net2) { + QTextStream(stdout) + << "Deleting network " + << "networks/" + m_lastMatch.parameters()["firstNet"] + + ".gz" + << endl; + QFile::remove("networks/" + m_lastMatch.parameters()["firstNet"] + + ".gz"); } - if (m_lastMatch.parameters()["secondNet"] != net1 && - m_lastMatch.parameters()["secondNet"] != net2) { - QTextStream(stdout) << "Deleting network " << "networks/" + m_lastMatch.parameters()["secondNet"] << endl; - QFile::remove("networks/" + m_lastMatch.parameters()["secondNet"]); + if (m_lastMatch.parameters()["secondNet"] != net1 + && m_lastMatch.parameters()["secondNet"] != net2) { + QTextStream(stdout) + << "Deleting network " + << "networks/" + m_lastMatch.parameters()["secondNet"] + + ".gz" + << endl; + QFile::remove("networks/" + + m_lastMatch.parameters()["secondNet"] + ".gz"); } } m_lastMatch = o; @@ -425,10 +515,12 @@ Order Management::getWorkInternal(bool tuning) { QTextStream(stdout) << "second network " << net2 << "." << endl; } if (ob.value("cmd").toString() == "wait") { - o.type(Order::Wait); parameters["minutes"] = ob.value("minutes").toString(); + + o.type(Order::Wait); o.parameters(parameters); - QTextStream(stdout) << "minutes: " << parameters["minutes"] << "." 
<< endl; + QTextStream(stdout) + << "minutes: " << parameters["minutes"] << "." << endl; } return o; } @@ -437,25 +529,26 @@ Order Management::getWork(bool tuning) { for (auto retries = 0; retries < MAX_RETRIES; retries++) { try { return getWorkInternal(tuning); - } catch (NetworkException ex) { + } catch (const NetworkException& ex) { QTextStream(stdout) << "Network connection to server failed." << endl; - QTextStream(stdout) - << ex.what() << endl; + QTextStream(stdout) << ex.what() << endl; auto retry_delay = - std::min( - RETRY_DELAY_MIN_SEC * std::pow(1.5, retries), - RETRY_DELAY_MAX_SEC); - QTextStream(stdout) << "Retrying in " << retry_delay << " s." - << endl; + std::min(RETRY_DELAY_MIN_SEC * std::pow(1.5, retries), + RETRY_DELAY_MAX_SEC); + QTextStream(stdout) + << "Retrying in " << retry_delay << " s." << endl; QThread::sleep(retry_delay); } } - QTextStream(stdout) << "Maximum number of retries exceeded. Falling back to previous network." + QTextStream(stdout) << "Maximum number of retries exceeded. Falling back " + "to previous network." 
<< endl; if (m_fallBack.type() != Order::Error) { - QMap map = m_fallBack.parameters(); - QString seed = QString::number(QUuid::createUuid().toRfc4122().toHex().left(8).toLongLong(Q_NULLPTR, 16)); + QMap map = m_fallBack.parameters(); + QString seed = QString::number( + QUuid::createUuid().toRfc4122().toHex().left(8).toLongLong( + Q_NULLPTR, 16)); QString rs = "-s " + seed + " "; map["rndSeed"] = seed; QString opt = map["options"]; @@ -468,10 +561,7 @@ Order Management::getWork(bool tuning) { exit(EXIT_FAILURE); } - -bool Management::networkExists(const QString &name) { - QString realHash = name; - realHash.remove(0,9); +bool Management::networkExists(const QString& name, const QString& gzipHash) { if (QFileInfo::exists(name)) { QFile f(name); if (f.open(QFile::ReadOnly)) { @@ -480,9 +570,12 @@ bool Management::networkExists(const QString &name) { throw NetworkException("Reading network file failed."); } QString result = hash.result().toHex(); - if (result == realHash) { + if (result == gzipHash) { return true; } + QTextStream(stdout) + << "Downloaded network hash doesn't match, calculated: " + << result << " it should be: " << gzipHash << endl; } else { QTextStream(stdout) << "Unable to open network file for reading." << endl; @@ -492,19 +585,17 @@ bool Management::networkExists(const QString &name) { throw NetworkException("Unable to delete the network file." " Check permissions."); } - QTextStream(stdout) << "Downloaded network hash doesn't match." 
<< endl; - f.remove(); } return false; } -void Management::fetchNetwork(const QString &net) { - QString name = "networks/" + net; - if (networkExists(name)) { +void Management::fetchNetwork(const QString& net, const QString& hash) { + QString name = "networks/" + net + ".gz"; + if (networkExists(name, hash)) { return; } - if (QFileInfo::exists(name + ".gz")) { - QFile f_gz(name + ".gz"); + if (QFileInfo::exists(name)) { + QFile f_gz(name); // Curl refuses to overwrite, so make sure to delete the gzipped // network if it exists f_gz.remove(); @@ -516,9 +607,9 @@ void Management::fetchNetwork(const QString &net) { #endif // Be quiet, but output the real file name we saved. // Use the filename from the server. - prog_cmdline.append(" -s -J -o " + name + ".gz "); + prog_cmdline.append(" -s -J -o " + name + " "); prog_cmdline.append(" -w %{filename_effective}"); - prog_cmdline.append(" http://zero.sjeng.org/" + name + ".gz"); + prog_cmdline.append(" " + server_url + name); QProcess curl; curl.start(prog_cmdline); @@ -533,27 +624,38 @@ void Management::fetchNetwork(const QString &net) { QString outstr(output); QStringList outlst = outstr.split("\n"); QString outfile = outlst[0]; + QTextStream(stdout) << "Net filename: " << outfile << endl; + return; +} + +QString Management::fetchGameData(const QString& name, + const QString& extension) { + QString prog_cmdline("curl"); #ifdef WIN32 - QProcess::execute("gzip.exe -d -q " + outfile); -#else - QProcess::execute("gunzip -q " + outfile); + prog_cmdline.append(".exe"); #endif - // Remove extension (.gz) - outfile.chop(3); - QTextStream(stdout) << "Net filename: " << outfile << endl; - if (!networkExists(name)) { - //If gunzip failed remove the .gz file - QFile f_gz(name + ".gz"); - f_gz.remove(); - throw NetworkException("Failed to fetch the network"); + const auto fileName = QUuid::createUuid().toRfc4122().toHex(); + + // Be quiet, but output the real file name we saved. + // Use the filename from the server. 
+ prog_cmdline.append(" -s -J -o " + fileName + "." + extension); + prog_cmdline.append(" -w %{filename_effective}"); + prog_cmdline.append(" " + server_url + "view/" + name + "." + extension); + + QProcess curl; + curl.start(prog_cmdline); + curl.waitForFinished(-1); + + if (curl.exitCode()) { + throw NetworkException("Curl returned non-zero exit code " + + std::to_string(curl.exitCode())); } - return; + return fileName; } - -void Management::archiveFiles(const QString &fileName) { +void Management::archiveFiles(const QString& fileName) { if (!m_keepPath.isEmpty()) { QFile(fileName + ".sgf").copy(m_keepPath + '/' + fileName + ".sgf"); } @@ -568,7 +670,7 @@ void Management::archiveFiles(const QString &fileName) { } } } -void Management::cleanupFiles(const QString &fileName) { +void Management::cleanupFiles(const QString& fileName) { QDir dir; QStringList filters; filters << fileName + ".*"; @@ -580,8 +682,8 @@ void Management::cleanupFiles(const QString &fileName) { } } -void Management::gzipFile(const QString &fileName) { - QString gzipCmd ="gzip"; +void Management::gzipFile(const QString& fileName) { + QString gzipCmd = "gzip"; #ifdef WIN32 gzipCmd.append(".exe"); #endif @@ -589,8 +691,10 @@ void Management::gzipFile(const QString &fileName) { QProcess::execute(gzipCmd); } -void Management::saveCurlCmdLine(const QStringList &prog_cmdline, const QString &name) { - QString fileName = "curl_save" + QUuid::createUuid().toRfc4122().toHex() + ".bin"; +void Management::saveCurlCmdLine(const QStringList& prog_cmdline, + const QString& name) { + QString fileName = + "curl_save" + QUuid::createUuid().toRfc4122().toHex() + ".bin"; QLockFile lf(fileName + ".lock"); lf.lock(); QFile f(fileName); @@ -617,7 +721,7 @@ void Management::sendAllGames() { QFileInfoList list = dir.entryInfoList(); for (int i = 0; i < list.size(); ++i) { QFileInfo fileInfo = list.at(i); - QLockFile lf(fileInfo.fileName()+".lock"); + QLockFile lf(fileInfo.fileName() + ".lock"); if (!lf.tryLock(10)) { 
continue; } @@ -643,26 +747,25 @@ void Management::sendAllGames() { try { sent = sendCurl(lines); if (sent) { - QTextStream(stdout) << "File: " << file.fileName() << " sent" << endl; + QTextStream(stdout) + << "File: " << file.fileName() << " sent" << endl; file.remove(); cleanupFiles(name); - if (i+1 < list.size()) { + if (i + 1 < list.size()) { QThread::sleep(10); } } - } catch (NetworkException ex) { + } catch (const NetworkException& ex) { QTextStream(stdout) << "Network connection to server failed." << endl; + QTextStream(stdout) << ex.what() << endl; QTextStream(stdout) - << ex.what() << endl; - QTextStream(stdout) - << "Retrying when next game is finished." - << endl; + << "Retrying when next game is finished." << endl; } } } -bool Management::sendCurl(const QStringList &lines) { +bool Management::sendCurl(const QStringList& lines) { QString prog_cmdline("curl"); #ifdef WIN32 prog_cmdline.append(".exe"); @@ -676,11 +779,11 @@ bool Management::sendCurl(const QStringList &lines) { curl.start(prog_cmdline); curl.waitForFinished(-1); if (curl.exitCode()) { - QTextStream(stdout) << "Upload failed. Curl Exit code: " - << curl.exitCode() << endl; + QTextStream(stdout) + << "Upload failed. 
Curl Exit code: " << curl.exitCode() << endl; QTextStream(stdout) << curl.readAllStandardOutput(); throw NetworkException("Curl returned non-zero exit code " - + std::to_string(curl.exitCode())); + + std::to_string(curl.exitCode())); return false; } QTextStream(stdout) << curl.readAllStandardOutput(); @@ -697,11 +800,13 @@ bool Management::sendCurl(const QStringList &lines) { -F options_hash=c2e3 -F random_seed=0 -F sgf=@file -http://zero.sjeng.org/submit-match +https://zero.sjeng.org/submit-match */ -void Management::uploadResult(const QMap &r, const QMap &l) { - QTextStream(stdout) << "Uploading match: " << r["file"] << ".sgf for networks "; +void Management::uploadResult(const QMap& r, + const QMap& l) { + QTextStream(stdout) << "Uploading match: " << r["file"] + << ".sgf for networks "; QTextStream(stdout) << l["firstNet"] << " and " << l["secondNet"] << endl; archiveFiles(r["file"]); gzipFile(r["file"] + ".sgf"); @@ -714,30 +819,28 @@ void Management::uploadResult(const QMap &r, const QMap( - RETRY_DELAY_MIN_SEC * std::pow(1.5, retries), - RETRY_DELAY_MAX_SEC); - QTextStream(stdout) << "Retrying in " << retry_delay << " s." - << endl; + std::min(RETRY_DELAY_MIN_SEC * std::pow(1.5, retries), + RETRY_DELAY_MAX_SEC); + QTextStream(stdout) + << "Retrying in " << retry_delay << " s." 
<< endl; QThread::sleep(retry_delay); } } @@ -748,7 +851,6 @@ void Management::uploadResult(const QMap &r, const QMap &r, const QMap &r, const QMap &l) { - QTextStream(stdout) << "Uploading game: " << r["file"] << ".sgf for network " << l["network"] << endl; +void Management::uploadData(const QMap& r, + const QMap& l) { + QTextStream(stdout) << "Uploading game: " << r["file"] + << ".sgf for network " << l["network"] << endl; archiveFiles(r["file"]); gzipFile(r["file"] + ".sgf"); QStringList prog_cmdline; prog_cmdline.append("-F networkhash=" + l["network"]); prog_cmdline.append("-F clientversion=" + QString::number(m_version)); - prog_cmdline.append("-F options_hash="+ l["optHash"]); - prog_cmdline.append("-F movescount="+ r["moves"]); - prog_cmdline.append("-F winnercolor="+ r["winner"]); - prog_cmdline.append("-F random_seed="+ l["rndSeed"]); + prog_cmdline.append("-F options_hash=" + l["optHash"]); + prog_cmdline.append("-F movescount=" + r["moves"]); + prog_cmdline.append("-F winnercolor=" + r["winner"]); + prog_cmdline.append("-F random_seed=" + l["rndSeed"]); prog_cmdline.append("-F sgf=@" + r["file"] + ".sgf.gz"); prog_cmdline.append("-F trainingdata=@" + r["file"] + ".txt.0.gz"); - prog_cmdline.append("http://zero.sjeng.org/submit"); + prog_cmdline.append(server_url + "submit"); bool sent = false; for (auto retries = 0; retries < MAX_RETRIES; retries++) { try { sent = sendCurl(prog_cmdline); break; - } catch (NetworkException ex) { + } catch (const NetworkException& ex) { QTextStream(stdout) << "Network connection to server failed." << endl; - QTextStream(stdout) - << ex.what() << endl; + QTextStream(stdout) << ex.what() << endl; auto retry_delay = - std::min( - RETRY_DELAY_MIN_SEC * std::pow(1.5, retries), - RETRY_DELAY_MAX_SEC); - QTextStream(stdout) << "Retrying in " << retry_delay << " s." - << endl; + std::min(RETRY_DELAY_MIN_SEC * std::pow(1.5, retries), + RETRY_DELAY_MAX_SEC); + QTextStream(stdout) + << "Retrying in " << retry_delay << " s." 
<< endl; QThread::sleep(retry_delay); } } diff --git a/autogtp/Management.h b/autogtp/Management.h index cb6715c13..a5c503b92 100644 --- a/autogtp/Management.h +++ b/autogtp/Management.h @@ -19,33 +19,31 @@ #define MANAGEMENT_H #include +#include +#include #include #include #include #include -#include -#include #include #include #include + #include "Worker.h" -constexpr int AUTOGTP_VERSION = 16; +constexpr int AUTOGTP_VERSION = 18; class Management : public QObject { Q_OBJECT public: - Management(const int gpus, - const int games, - const QStringList& gpuslist, - const int ver, - const int maxGame, - const bool delNetworks, - const QString& keep, + Management(int gpus, int games, const QStringList& gpuslist, int ver, + int maxGame, bool delNetworks, const QString& keep, const QString& debug); ~Management() = default; void giveAssignments(); - void incMoves() { m_movesMade++; } + void incMoves() { + m_movesMade++; + } void wait(); signals: void sendQuit(); @@ -54,12 +52,9 @@ public slots: void storeGames(); private: - - struct NetworkException: public std::runtime_error - { + struct NetworkException : public std::runtime_error { NetworkException(std::string const& message) - : std::runtime_error("NetworkException: " + message) - {} + : std::runtime_error("NetworkException: " + message) {} }; QMutex m_syncMutex; QVector m_gamesThreads; @@ -81,28 +76,35 @@ public slots: int m_gamesLeft; int m_threadsLeft; bool m_delNetworks; - QLockFile *m_lockFile; + QLockFile* m_lockFile; + QString m_leelaversion; Order getWorkInternal(bool tuning); Order getWork(bool tuning = false); - Order getWork(const QFileInfo &file); - QString getOption(const QJsonObject &ob, const QString &key, const QString &opt, const QString &defValue); - QString getBoolOption(const QJsonObject &ob, const QString &key, const QString &opt, bool defValue); - QString getOptionsString(const QJsonObject &opt, const QString &rnd); + Order getWork(const QFileInfo& file); + QString getOption(const QJsonObject& 
ob, const QString& key, + const QString& opt, const QString& defValue); + QString getBoolOption(const QJsonObject& ob, const QString& key, + const QString& opt, bool defValue); + QString getOptionsString(const QJsonObject& opt, const QString& rnd); + QString getGtpCommandsString(const QJsonValue& gtpCommands); void sendAllGames(); void checkStoredGames(); QFileInfo getNextStored(); - bool networkExists(const QString &name); - void fetchNetwork(const QString &net); + bool networkExists(const QString& name, const QString& gzipHash); + void fetchNetwork(const QString& net, const QString& hash); + QString fetchGameData(const QString& name, const QString& extension); void printTimingInfo(float duration); - void runTuningProcess(const QString &tuneCmdLine); - void gzipFile(const QString &fileName); - bool sendCurl(const QStringList &lines); - void saveCurlCmdLine(const QStringList &prog_cmdline, const QString &name); - void archiveFiles(const QString &fileName); - void cleanupFiles(const QString &fileName); - void uploadData(const QMap &r, const QMap &l); - void uploadResult(const QMap &r, const QMap &l); + void runTuningProcess(const QString& tuneCmdLine); + void gzipFile(const QString& fileName); + bool sendCurl(const QStringList& lines); + void saveCurlCmdLine(const QStringList& prog_cmdline, const QString& name); + void archiveFiles(const QString& fileName); + void cleanupFiles(const QString& fileName); + void uploadData(const QMap& r, + const QMap& l); + void uploadResult(const QMap& r, + const QMap& l); }; #endif diff --git a/autogtp/Order.cpp b/autogtp/Order.cpp index 874d892f6..bda07fafa 100644 --- a/autogtp/Order.cpp +++ b/autogtp/Order.cpp @@ -16,11 +16,12 @@ along with Leela Zero. If not, see . 
*/ -#include "Order.h" #include #include -void Order::save(const QString &file) { +#include "Order.h" + +void Order::save(const QString& file) { QFile f(file); if (!f.open(QIODevice::WriteOnly | QIODevice::Text)) { return; @@ -28,28 +29,27 @@ void Order::save(const QString &file) { QTextStream out(&f); out << m_type << endl; out << m_parameters.size() << endl; - for (QString key : m_parameters.keys()) - { + for (QString key : m_parameters.keys()) { out << key << " " << m_parameters.value(key) << endl; } out.flush(); f.close(); } -void Order::load(const QString &file) { +void Order::load(const QString& file) { QFile f(file); if (!f.open(QIODevice::ReadOnly | QIODevice::Text)) { return; } QTextStream in(&f); - in >> m_type; + in >> m_type; int count; in >> count; QString key; for (int i = 0; i < count; i++) { in >> key; - if (key == "options") { - m_parameters[key] = in.readLine(); + if (key.contains("options") || key.contains("gtpCommands")) { + m_parameters[key] = in.readLine().remove(0, 1); } else { in >> m_parameters[key]; } diff --git a/autogtp/Order.h b/autogtp/Order.h index 758efd05c..38a780e83 100644 --- a/autogtp/Order.h +++ b/autogtp/Order.h @@ -19,8 +19,8 @@ #ifndef ORDER_H #define ORDER_H -#include #include +#include class Order { public: @@ -33,22 +33,44 @@ class Order { RestoreSelfPlayed }; Order() = default; - Order(int t, QMap p = QMap()) { m_type = t; m_parameters = p; } - Order(const Order &o) { m_type = o.m_type; m_parameters = o.m_parameters; } - Order &operator=(const Order &o) { m_type = o.m_type; m_parameters = o.m_parameters; return *this; } + Order(int t, QMap p = QMap()) { + m_type = t; + m_parameters = p; + } + Order(const Order& o) { + m_type = o.m_type; + m_parameters = o.m_parameters; + } + Order& operator=(const Order& o) { + m_type = o.m_type; + m_parameters = o.m_parameters; + return *this; + } ~Order() = default; - void type(int t) { m_type = t; } - int type() const { return m_type; } - QMap parameters() const { return m_parameters; 
} - void parameters(const QMap &l) { m_parameters = l; } - void add(const QString &key, const QString &value) { m_parameters[key] = value; } - bool isValid() { return (m_type > Error && m_type <= RestoreSelfPlayed); } - void save(const QString &file); - void load(const QString &file); + void type(int t) { + m_type = t; + } + int type() const { + return m_type; + } + QMap parameters() const { + return m_parameters; + } + void parameters(const QMap& l) { + m_parameters = l; + } + void add(const QString& key, const QString& value) { + m_parameters[key] = value; + } + bool isValid() { + return m_type > Error && m_type <= RestoreSelfPlayed; + } + void save(const QString& file); + void load(const QString& file); private: int m_type; - QMap m_parameters; + QMap m_parameters; }; #endif // ORDER_H diff --git a/autogtp/README.md b/autogtp/README.md index fe9555c9f..bded50853 100644 --- a/autogtp/README.md +++ b/autogtp/README.md @@ -33,6 +33,7 @@ directly. Copy the compiled leelaz binary into the autogtp directory, and run autogtp. - cp ../src/leelaz . + cp ../build/leelaz . ./autogtp +While autogtp is running, typing q+Enter will save the processed data and exit. When autogtp runs next, autogtp will continue the game. 
diff --git a/autogtp/Result.h b/autogtp/Result.h index 4405802c7..2d05535b8 100644 --- a/autogtp/Result.h +++ b/autogtp/Result.h @@ -19,8 +19,8 @@ #ifndef RESULT_H #define RESULT_H -#include #include +#include class Result { public: @@ -34,16 +34,30 @@ class Result { Error }; Result() = default; - Result(int t, QMap n = QMap()) { m_type = t, m_parameters = n; } + Result(int t, QMap n = QMap()) { + m_type = t; + m_parameters = n; + } ~Result() = default; - void type(int t) { m_type = t; } - int type() { return m_type; } - void add(const QString &name, const QString &value) { m_parameters[name] = value; } - QMap parameters() { return m_parameters; } - void clear() { m_parameters.clear(); } + void type(int t) { + m_type = t; + } + int type() { + return m_type; + } + void add(const QString& name, const QString& value) { + m_parameters[name] = value; + } + QMap parameters() { + return m_parameters; + } + void clear() { + m_parameters.clear(); + } + private: int m_type; - QMap m_parameters; + QMap m_parameters; }; #endif // RESULT_H diff --git a/autogtp/Worker.cpp b/autogtp/Worker.cpp index 76972ba46..af74b82c8 100644 --- a/autogtp/Worker.cpp +++ b/autogtp/Worker.cpp @@ -16,21 +16,17 @@ along with Leela Zero. If not, see . 
*/ -#include "Worker.h" -#include "Game.h" -#include #include +#include #include #include +#include "Worker.h" + +#include "Game.h" -Worker::Worker(int index, const QString& gpuIndex, Management *parent) : - m_index(index), - m_state(), - m_gpu(""), - m_job(nullptr), - m_boss(parent) -{ +Worker::Worker(int index, const QString& gpuIndex, Management* parent) + : m_index(index), m_state(), m_gpu(""), m_job(nullptr), m_boss(parent) { if (!gpuIndex.isEmpty()) { m_gpu = " --gpu=" + gpuIndex + " "; } @@ -42,8 +38,7 @@ void Worker::doStore() { m_state.store(STORING); } -void Worker::order(Order o) -{ +void Worker::order(Order o) { if (!o.isValid()) { if (m_job != nullptr) { m_job->finish(); @@ -57,46 +52,44 @@ void Worker::order(Order o) m_job->init(m_todo); } - void Worker::createJob(int type) { if (m_job != nullptr) { delete m_job; } switch (type) { - case Order::Production: - case Order::RestoreSelfPlayed: - m_job = new ProductionJob(m_gpu, m_boss); - break; - case Order::Validation: - case Order::RestoreMatch: - m_job = new ValidationJob(m_gpu, m_boss); - break; - case Order::Wait: - m_job = new WaitJob(m_gpu, m_boss); - break; + case Order::Production: + case Order::RestoreSelfPlayed: + m_job = new ProductionJob(m_gpu, m_boss); + break; + case Order::Validation: + case Order::RestoreMatch: + m_job = new ValidationJob(m_gpu, m_boss); + break; + case Order::Wait: + m_job = new WaitJob(m_gpu, m_boss); + break; } } void Worker::run() { - Result res; - do { + Result res; + do { auto start = std::chrono::high_resolution_clock::now(); res = m_job->execute(); auto end = std::chrono::high_resolution_clock::now(); auto gameDuration = - std::chrono::duration_cast(end - start).count(); + std::chrono::duration_cast(end - start) + .count(); if (m_state != STORING) { emit resultReady(m_todo, res, m_index, gameDuration); } } while (m_state == RUNNING); if (m_state == STORING) { m_todo.add("moves", res.parameters()["moves"]); + m_todo.add("sgf", res.parameters()["sgf"]); if 
(res.type() == Result::StoreMatch) { - m_todo.add("sgfFirst", res.parameters()["sgfFirst"]); - m_todo.add("sgfSecond", res.parameters()["sgfSecond"]); m_todo.type(Order::RestoreMatch); } else { - m_todo.add("sgf", res.parameters()["sgf"]); m_todo.type(Order::RestoreSelfPlayed); } QString unique = QUuid::createUuid().toRfc4122().toHex(); diff --git a/autogtp/Worker.h b/autogtp/Worker.h index d9d9fcf1b..f42d3ef14 100644 --- a/autogtp/Worker.h +++ b/autogtp/Worker.h @@ -19,12 +19,12 @@ #ifndef WORKER_H #define WORKER_H +#include +#include + #include "Job.h" #include "Order.h" -#include -#include - class Management; class Worker : public QThread { @@ -35,21 +35,25 @@ class Worker : public QThread { FINISHING, STORING }; - Worker(int index, const QString& gpuIndex, Management *parent); + Worker(int index, const QString& gpuIndex, Management* parent); ~Worker() = default; void order(Order o); - void doFinish() { m_job->finish(); m_state.store(FINISHING); } + void doFinish() { + m_job->finish(); + m_state.store(FINISHING); + } void doStore(); void run() override; signals: void resultReady(Order o, Result r, int index, int duration); + private: int m_index; QAtomicInt m_state; QString m_gpu; Order m_todo; - Job *m_job; - Management *m_boss; + Job* m_job; + Management* m_boss; void createJob(int type); }; diff --git a/autogtp/main.cpp b/autogtp/main.cpp index 3bc08c5f6..512eed026 100644 --- a/autogtp/main.cpp +++ b/autogtp/main.cpp @@ -17,28 +17,29 @@ along with Leela Zero. If not, see . 
*/ -#include -#include -#include -#include #include -#include +#include +#include #include #include -#include +#include +#include +#include +#include +#include #include -#include #include #ifdef WIN32 #include #endif #include #include + +#include "Console.h" #include "Game.h" #include "Management.h" -#include "Console.h" -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { QCoreApplication app(argc, argv); app.setApplicationName("autogtp"); app.setApplicationVersion(QString("v%1").arg(AUTOGTP_VERSION)); @@ -49,34 +50,32 @@ int main(int argc, char *argv[]) { QCommandLineOption gamesNumOption( {"g", "gamesNum"}, - "Play 'gamesNum' games on one GPU at the same time.", - "num", "1"); + "Play 'gamesNum' games on one device (GPU/CPU) at the same time.", + "num", "1"); QCommandLineOption gpusOption( {"u", "gpus"}, - "Index of the GPU to use for multiple GPUs support.", - "num"); + "Index of the device(s) to use for multiple devices support.", "num"); QCommandLineOption keepSgfOption( - {"k", "keepSgf" }, - "Save SGF files after each self-play game.", - "output directory"); + {"k", "keepSgf"}, "Save SGF files after each self-play game.", + "output directory"); QCommandLineOption keepDebugOption( - { "d", "debug" }, "Save training and extra debug files after each self-play game.", - "output directory"); - QCommandLineOption timeoutOption( - { "t", "timeout" }, "Save running games after the timeout (in minutes) is passed and then exit.", - "time in minutes"); + {"d", "debug"}, + "Save training and extra debug files after each self-play game.", + "output directory"); + QCommandLineOption timeoutOption({"t", "timeout"}, + "Save running games after the timeout (in " + "minutes) is passed and then exit.", + "time in minutes"); QCommandLineOption singleOption( - { "s", "single" }, "Exit after the first game is completed.", - ""); + {"s", "single"}, "Exit after the first game is completed.", ""); QCommandLineOption maxOption( - { "m", "maxgames" }, "Exit after 
the given number of games is completed.", - "max number of games"); + {"m", "maxgames"}, "Exit after the given number of games is completed.", + "max number of games"); QCommandLineOption eraseOption( - { "e", "erase" }, "Erase old networks when new ones are available.", - ""); + {"e", "erase"}, "Erase old networks when new ones are available.", ""); parser.addOption(gamesNumOption); parser.addOption(gpusOption); @@ -119,7 +118,7 @@ int main(int argc, char *argv[]) { // Map streams QTextStream cerr(stderr, QIODevice::WriteOnly); cerr << "AutoGTP v" << AUTOGTP_VERSION << endl; - cerr << "Using " << gamesNum << " thread(s) for GPU(s)." << endl; + cerr << "Using " << gamesNum << " game thread(s) per device." << endl; if (parser.isSet(keepSgfOption)) { if (!QDir().mkpath(parser.value(keepSgfOption))) { cerr << "Couldn't create output directory for self-play SGF files!" @@ -129,33 +128,38 @@ int main(int argc, char *argv[]) { } if (parser.isSet(keepDebugOption)) { if (!QDir().mkpath(parser.value(keepDebugOption))) { - cerr << "Couldn't create output directory for self-play Debug files!" - << endl; + cerr + << "Couldn't create output directory for self-play Debug files!" + << endl; return EXIT_FAILURE; } } - Console *cons = nullptr; + Console* cons = nullptr; if (!QDir().mkpath("networks")) { - cerr << "Couldn't create the directory for the networks files!" - << endl; + cerr << "Couldn't create the directory for the networks files!" 
<< endl; return EXIT_FAILURE; } - Management *boss = new Management(gpusNum, gamesNum, gpusList, AUTOGTP_VERSION, maxNum, - parser.isSet(eraseOption), parser.value(keepSgfOption), - parser.value(keepDebugOption)); - QObject::connect(&app, &QCoreApplication::aboutToQuit, boss, &Management::storeGames); - QTimer *timer = new QTimer(); + Management* boss = + new Management(gpusNum, gamesNum, gpusList, AUTOGTP_VERSION, maxNum, + parser.isSet(eraseOption), parser.value(keepSgfOption), + parser.value(keepDebugOption)); + QObject::connect(&app, &QCoreApplication::aboutToQuit, boss, + &Management::storeGames); + QTimer* timer = new QTimer(); boss->giveAssignments(); if (parser.isSet(timeoutOption)) { - QObject::connect(timer, &QTimer::timeout, &app, &QCoreApplication::quit); + QObject::connect(timer, &QTimer::timeout, &app, + &QCoreApplication::quit); timer->start(parser.value(timeoutOption).toInt() * 60000); - } else { - if (parser.isSet(singleOption) || parser.isSet(maxOption)) { - QObject::connect(boss, &Management::sendQuit, &app, &QCoreApplication::quit); - } else { - cons = new Console(); - QObject::connect(cons, &Console::sendQuit, &app, &QCoreApplication::quit); - } + } + if (parser.isSet(singleOption) || parser.isSet(maxOption)) { + QObject::connect(boss, &Management::sendQuit, &app, + &QCoreApplication::quit); + } + if (true) { + cons = new Console(); + QObject::connect(cons, &Console::sendQuit, &app, + &QCoreApplication::quit); } return app.exec(); } diff --git a/cmake/Modules/GetGitRevisionDescription.cmake b/cmake/Modules/GetGitRevisionDescription.cmake new file mode 100644 index 000000000..8ab03bc5f --- /dev/null +++ b/cmake/Modules/GetGitRevisionDescription.cmake @@ -0,0 +1,168 @@ +# - Returns a version string from Git +# +# These functions force a re-configure on each git commit so that you can +# trust the values of the variables in your build system. 
+# +# get_git_head_revision( [ ...]) +# +# Returns the refspec and sha hash of the current head revision +# +# git_describe( [ ...]) +# +# Returns the results of git describe on the source tree, and adjusting +# the output so that it tests false if an error occurs. +# +# git_get_exact_tag( [ ...]) +# +# Returns the results of git describe --exact-match on the source tree, +# and adjusting the output so that it tests false if there was no exact +# matching tag. +# +# git_local_changes() +# +# Returns either "CLEAN" or "DIRTY" with respect to uncommitted changes. +# Uses the return code of "git diff-index --quiet HEAD --". +# Does not regard untracked files. +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +if(__get_git_revision_description) + return() +endif() +set(__get_git_revision_description YES) + +# We must run the following at "include" time, not at function call time, +# to find the path to this module rather than the path to a calling list file +get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH) + +function(get_git_head_revision _refspecvar _hashvar) + set(GIT_PARENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + while(NOT EXISTS "${GIT_DIR}") # .git dir not found, search parent directories + set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}") + get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH) + if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT) + # We have reached the root directory, we are not in git + set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + return() + endif() + set(GIT_DIR 
"${GIT_PARENT_DIR}/.git") + endwhile() + # check if this is a submodule + if(NOT IS_DIRECTORY ${GIT_DIR}) + file(READ ${GIT_DIR} submodule) + string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" GIT_DIR_RELATIVE ${submodule}) + get_filename_component(SUBMODULE_DIR ${GIT_DIR} PATH) + get_filename_component(GIT_DIR ${SUBMODULE_DIR}/${GIT_DIR_RELATIVE} ABSOLUTE) + endif() + set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data") + if(NOT EXISTS "${GIT_DATA}") + file(MAKE_DIRECTORY "${GIT_DATA}") + endif() + + if(NOT EXISTS "${GIT_DIR}/HEAD") + return() + endif() + set(HEAD_FILE "${GIT_DATA}/HEAD") + configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY) + + configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in" + "${GIT_DATA}/grabRef.cmake" + @ONLY) + include("${GIT_DATA}/grabRef.cmake") + + set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE) + set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE) +endfunction() + +function(git_describe _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + get_git_head_revision(refspec hash) + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + if(NOT hash) + set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) + return() + endif() + + # TODO sanitize + #if((${ARGN}" MATCHES "&&") OR + # (ARGN MATCHES "||") OR + # (ARGN MATCHES "\\;")) + # message("Please report the following error to the project!") + # message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! 
Passed arguments ${ARGN}") + #endif() + + #message(STATUS "Arguments to execute_process: ${ARGN}") + + execute_process(COMMAND + "${GIT_EXECUTABLE}" + describe + ${hash} + ${ARGN} + WORKING_DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}" + RESULT_VARIABLE + res + OUTPUT_VARIABLE + out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT res EQUAL 0) + set(out "${out}-${res}-NOTFOUND") + endif() + + set(${_var} "${out}" PARENT_SCOPE) +endfunction() + +function(git_get_exact_tag _var) + git_describe(out --exact-match ${ARGN}) + set(${_var} "${out}" PARENT_SCOPE) +endfunction() + +function(git_local_changes _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + get_git_head_revision(refspec hash) + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + if(NOT hash) + set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) + return() + endif() + + execute_process(COMMAND + "${GIT_EXECUTABLE}" + diff-index --quiet HEAD -- + WORKING_DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}" + RESULT_VARIABLE + res + OUTPUT_VARIABLE + out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(res EQUAL 0) + set(${_var} "CLEAN" PARENT_SCOPE) + else() + set(${_var} "DIRTY" PARENT_SCOPE) + endif() +endfunction() diff --git a/cmake/Modules/GetGitRevisionDescription.cmake.in b/cmake/Modules/GetGitRevisionDescription.cmake.in new file mode 100644 index 000000000..6d8b708ef --- /dev/null +++ b/cmake/Modules/GetGitRevisionDescription.cmake.in @@ -0,0 +1,41 @@ +# +# Internal file for GetGitRevisionDescription.cmake +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. 
+# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +set(HEAD_HASH) + +file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024) + +string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS) +if(HEAD_CONTENTS MATCHES "ref") + # named branch + string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}") + if(EXISTS "@GIT_DIR@/${HEAD_REF}") + configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + else() + configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY) + file(READ "@GIT_DATA@/packed-refs" PACKED_REFS) + if(${PACKED_REFS} MATCHES "([0-9a-z]*) ${HEAD_REF}") + set(HEAD_HASH "${CMAKE_MATCH_1}") + endif() + endif() +else() + # detached HEAD + configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY) +endif() + +if(NOT HEAD_HASH) + file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024) + string(STRIP "${HEAD_HASH}" HEAD_HASH) +endif() diff --git a/msvc/VS2015/cmake_build.bat b/msvc/VS2015/cmake_build.bat new file mode 100644 index 000000000..937a86d1b --- /dev/null +++ b/msvc/VS2015/cmake_build.bat @@ -0,0 +1,18 @@ +set PKG_FOLDER="%cd%\msvc\packages" +git submodule update --init --recursive +mkdir build +cd build +set BLAS_HOME="..\msvc\packages\OpenBLAS.0.2.14.1\lib\native" +for /F %%f in ("%features%") do set DEFINES=%DEFINES% -D%%f=1 +cmake -G "Visual Studio 14 2015 Win64" %DEFINES% ^ + -DCMAKE_PREFIX_PATH="%QTDIR%/lib/cmake/" ^ + -DBOOST_ROOT="C:/Libraries/boost_1_65_1" ^ + -DBOOST_LIBRARYDIR="C:/Libraries/boost_1_65_1/lib64-msvc-14.1" ^ + -DBoost_USE_STATIC_LIBS=ON ^ + -DZLIB_ROOT="%PKG_FOLDER%/zlib-msvc14-x64.1.2.11.7795/build/native" ^ + -DZLIB_LIBRARY="%PKG_FOLDER%/zlib-msvc14-x64.1.2.11.7795/build/native/zlib-msvc14-x64.targets" ^ + -DOpenCL_LIBRARY="%PKG_FOLDER%/opencl-nug.0.777.12/build/native/opencl-nug.targets" ^ + -DOpenCL_INCLUDE_DIR="%PKG_FOLDER%/opencl-nug.0.777.12/build/native/include" ^ + -DBLAS_LIBRARIES="%PKG_FOLDER%/OpenBLAS.0.2.14.1/build/native/openblas.targets" ^ + 
-Dgtest_force_shared_crt=ON .. +cmake --build . --config Release -- /maxcpucount:1 diff --git a/msvc/VS2015/leela-zero.vcxproj b/msvc/VS2015/leela-zero.vcxproj index dc31e6f6e..2320c983b 100644 --- a/msvc/VS2015/leela-zero.vcxproj +++ b/msvc/VS2015/leela-zero.vcxproj @@ -1,4 +1,4 @@ - + @@ -43,9 +43,11 @@ leelaz + ..\..\src\Eigen;$(IncludePath) leelaz + ..\..\src\Eigen;$(IncludePath) @@ -82,6 +84,7 @@ + @@ -93,6 +96,7 @@ + @@ -109,6 +113,8 @@ + + @@ -121,6 +127,7 @@ + @@ -134,8 +141,9 @@ - + + @@ -145,7 +153,8 @@ - + + diff --git a/msvc/VS2015/leela-zero.vcxproj.filters b/msvc/VS2015/leela-zero.vcxproj.filters index eebb49412..be8d32e0f 100644 --- a/msvc/VS2015/leela-zero.vcxproj.filters +++ b/msvc/VS2015/leela-zero.vcxproj.filters @@ -42,6 +42,12 @@ Header Files + + Header Files + + + Header Files + Header Files @@ -72,6 +78,9 @@ Header Files + + Header Files + Header Files @@ -116,6 +125,9 @@ Source Files + + Source Files + Source Files @@ -143,6 +155,9 @@ Source Files + + Source Files + Source Files diff --git a/msvc/VS2015/packages.config b/msvc/VS2015/packages.config index d2b2cab78..654ed217e 100644 --- a/msvc/VS2015/packages.config +++ b/msvc/VS2015/packages.config @@ -1,9 +1,10 @@  + - - \ No newline at end of file + + diff --git a/msvc/VS2017/autogtp.vcxproj b/msvc/VS2017/autogtp.vcxproj index 7704ebdcf..d207aeb15 100644 --- a/msvc/VS2017/autogtp.vcxproj +++ b/msvc/VS2017/autogtp.vcxproj @@ -128,7 +128,6 @@ {B12702AD-ABFB-343A-A199-8E24837244A3} Qt4VSv1.0 - 10.0.16299.0 diff --git a/msvc/VS2017/cmake_build.bat b/msvc/VS2017/cmake_build.bat new file mode 100644 index 000000000..df7bf5132 --- /dev/null +++ b/msvc/VS2017/cmake_build.bat @@ -0,0 +1,8 @@ +set PKG_FOLDER="%cd%\msvc\packages" +git submodule update --init --recursive +mkdir build +cd build +set BLAS_HOME="..\msvc\packages\OpenBLAS.0.2.14.1\lib\native" +for /F %%f in ("%features%") do set DEFINES=%DEFINES% -D%%f=1 +cmake -G "Visual Studio 15 2017 Win64" %DEFINES% 
-DCMAKE_PREFIX_PATH="%QTDIR%/lib/cmake/" -DBOOST_ROOT="C:/Libraries/boost_1_65_1" -DBOOST_LIBRARYDIR="C:/Libraries/boost_1_65_1/lib64-msvc-14.1" -DBoost_USE_STATIC_LIBS=ON -DZLIB_ROOT="%PKG_FOLDER%/zlib-msvc14-x64.1.2.11.7795/build/native" -DZLIB_LIBRARY="%PKG_FOLDER%/zlib-msvc14-x64.1.2.11.7795/build/native/zlib-msvc14-x64.targets" -DOpenCL_LIBRARY="%PKG_FOLDER%/opencl-nug.0.777.12/build/native/opencl-nug.targets" -DOpenCL_INCLUDE_DIR="%PKG_FOLDER%/opencl-nug.0.777.12/build/native/include" -DBLAS_LIBRARIES="%PKG_FOLDER%/OpenBLAS.0.2.14.1/build/native/openblas.targets" -Dgtest_force_shared_crt=ON .. +cmake --build . --config Release -- /maxcpucount:1 diff --git a/msvc/VS2017/leela-zero.vcxproj b/msvc/VS2017/leela-zero.vcxproj index ed6c73131..9b8afb34a 100644 --- a/msvc/VS2017/leela-zero.vcxproj +++ b/msvc/VS2017/leela-zero.vcxproj @@ -30,6 +30,8 @@ + + @@ -58,6 +60,7 @@ + @@ -80,7 +83,6 @@ {7B887BFE-8D2C-46CD-B139-5213434BF218} Win32Proj leelazero - 10.0.16299.0 @@ -132,6 +134,7 @@ true + ..\..\src\Eigen;$(IncludePath) false @@ -140,6 +143,7 @@ false leelaz $(Platform)\$(Configuration)\leelaz\ + ..\..\src\Eigen;$(IncludePath) @@ -203,22 +207,24 @@ - - - + + + + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- - - + + + + - \ No newline at end of file + diff --git a/msvc/VS2017/leela-zero.vcxproj.filters b/msvc/VS2017/leela-zero.vcxproj.filters index 936338b4c..5ac9f8662 100644 --- a/msvc/VS2017/leela-zero.vcxproj.filters +++ b/msvc/VS2017/leela-zero.vcxproj.filters @@ -42,6 +42,12 @@ Header Files + + Header Files + + + Header Files + Header Files @@ -122,6 +128,9 @@ Source Files + + Source Files + Source Files @@ -177,4 +186,4 @@ - \ No newline at end of file + diff --git a/msvc/VS2017/packages.config b/msvc/VS2017/packages.config index fdf64d154..7de7578b6 100644 --- a/msvc/VS2017/packages.config +++ b/msvc/VS2017/packages.config @@ -1,8 +1,9 @@  - - - + + + + diff --git a/scripts/resign_analysis/resign_analysis.py b/scripts/resign_analysis/resign_analysis.py index 79a71749b..3042b7a19 100755 --- a/scripts/resign_analysis/resign_analysis.py +++ b/scripts/resign_analysis/resign_analysis.py @@ -17,6 +17,7 @@ # along with Leela Zero. If not, see . import argparse +import math import os import sys @@ -51,7 +52,7 @@ def to_move_str(to_move): if (to_move): return "W" else: return "B" -def parseGameBody(filename, fh, tfh, verbose): +def parseGameBody(filename, fh, tfh, verbose, resignthr): gs = GameStats(filename) movenum = 0 while 1: @@ -75,11 +76,11 @@ def parseGameBody(filename, fh, tfh, verbose): if verbose >= 3: print("+", to_move, movenum, netwinrate, child_uctwinrate, bestmovevisits) - if not gs.resign_type and child_uctwinrate < resignrate: + if not gs.resign_type and child_uctwinrate < resignthr: if verbose >= 1: - print(("Wrong resign -- %s rr=%0.3f wr=%0.3f " + print(("Wrong resign -- %s rt=%0.3f wr=%0.3f " "winner=%s movenum=%d") % - (filename, resignrate, child_uctwinrate, + (filename, resignthr, child_uctwinrate, to_move_str(to_move), movenum)) if verbose >= 3: print("policy_weights", policy_weights) @@ -89,7 +90,7 @@ def parseGameBody(filename, fh, tfh, verbose): if verbose >= 2: print("-", to_move, movenum, netwinrate, child_uctwinrate, bestmovevisits) - if 
not gs.resign_type and child_uctwinrate < resignrate: + if not gs.resign_type and child_uctwinrate < resignthr: if verbose >= 2: print("Correct resign -- %s" % (filename)) gs.resign_type = "Correct" @@ -97,7 +98,7 @@ def parseGameBody(filename, fh, tfh, verbose): gs.total_moves = movenum return gs -def parseGames(filenames, resignrate, verbose, prefixes): +def parseGames(filenames, resignthr, verbose, prefixes): gsd = {} for filename in filenames: training_filename = filename.replace(".debug", "") @@ -113,13 +114,13 @@ def parseGames(filenames, resignrate, verbose, prefixes): continue cfg_resignpct = int(cfg_resignpct) if cfg_resignpct == 0: - gsd[filename] = parseGameBody(filename, fh, tfh, verbose) + gsd[filename] = parseGameBody(filename, fh, tfh, verbose, resignthr) elif verbose >= 2: print("{} was played with -r {}, skipping".format( filename, cfg_resignpct)) return gsd -def resignStats(gsd, resignrate): +def resignStats(gsd, resignthr): # [ B wins, W wins, Overall ] stats = [ TotalStats(), TotalStats(), TotalStats() ] for gs in gsd.values(): @@ -136,8 +137,8 @@ def resignStats(gsd, resignrate): stats[gs.winner].resigned_game_len_sum += gs.resign_movenum stats[gs.winner].game_len_sum += gs.total_moves stats[2].calcOverall(stats[0], stats[1]) - print("Resign rate: %0.2f - Black won %d/%d (%0.2f%%)" % ( - resignrate, + print("Resign thr: %0.2f - Black won %d/%d (%0.2f%%)" % ( + resignthr, stats[0].num_games, stats[0].num_games+stats[1].num_games, 100 * stats[0].num_games / (stats[0].num_games+stats[1].num_games))) @@ -169,6 +170,7 @@ def resignStats(gsd, resignrate): print("%s - Average game length: %d/%d (%0.2f%% reduction)" % ( win_str, resigned_avg_len, avg_len, avg_reduction*100)) print() + return stats if __name__ == "__main__": usage_str = """ @@ -191,12 +193,17 @@ def resignStats(gsd, resignrate): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=usage_str) - 
default_resignrates="0.5,0.2,0.15,0.1,0.05,0.02,0.01" + default_resignthrs="0.5,0.2,0.15,0.1,0.05,0.02,0.01" parser.add_argument( - "-r", metavar="Resign_rates", dest="resignrates", type=str, - default=default_resignrates, + "-r", metavar="Resign_thresholds", dest="resignthrs", type=str, + default=default_resignthrs, help="comma separated resign thresholds (default {})".format( - default_resignrates)) + default_resignthrs)) + parser.add_argument( + "-R", metavar="Resign_rate", dest="resignrate", type=float, + help="If specified, a search is performed that finds the maximum \ + resign threshold that can be set without exceeding the given \ + resign rate") parser.add_argument( "-v", metavar="Verbose", dest="verbose", type=int, default=0, help="Verbosity level (default 0)") @@ -207,13 +214,36 @@ def resignStats(gsd, resignrate): "-n", metavar="Prefix", dest="networks", nargs="+", help="Prefixes of specific networks to analyze") args = parser.parse_args() - resignrates = [float(i) for i in args.resignrates.split(",")] + resignthrs = [float(i) for i in args.resignthrs.split(",")] if args.networks: print("Analyzing networks starting with: {}".format( ",".join(args.networks))) - for resignrate in (resignrates): - gsd = parseGames(args.data, resignrate, args.verbose, args.networks) + + for resignthr in (resignthrs): + gsd = parseGames(args.data, resignthr, args.verbose, args.networks) if gsd: - resignStats(gsd, resignrate) + resignStats(gsd, resignthr) else: print("No games to analyze (for more info try running with -v 2)") + + if args.resignrate: + L = 0.0 + R = 0.5 + while L < R : + resignthr = math.floor((L + R) * 50) / 100 + gsd = parseGames(args.data, resignthr, args.verbose, args.networks) + if not gsd: + print("No games to analyze (for more info try running with -v 2)") + break + stats = resignStats(gsd, resignthr) + wrong_rate = stats[2].wrong_resign_count / stats[2].num_games + if wrong_rate > args.resignrate: + if R == resignthr: + R = (math.floor(resignthr * 
100) - 1) / 100 + else: + R = resignthr + else: + L = (math.floor(resignthr * 100) + 1) / 100 + if (L == R): + print(("The highest the resign threshold should be set to: %0.2f") + % (R - 0.01)) diff --git a/src/CL/cl2.hpp b/src/CL/cl2.hpp index f7d28d30a..d49c156b4 100644 --- a/src/CL/cl2.hpp +++ b/src/CL/cl2.hpp @@ -447,6 +447,8 @@ # undef CL_HPP_TARGET_OPENCL_VERSION # define CL_HPP_TARGET_OPENCL_VERSION 200 #endif +/* Forward target OpenCL version to C headers */ +#define CL_TARGET_OPENCL_VERSION CL_HPP_TARGET_OPENCL_VERSION #if !defined(CL_HPP_MINIMUM_OPENCL_VERSION) # define CL_HPP_MINIMUM_OPENCL_VERSION 200 @@ -1139,6 +1141,8 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ @@ -1235,8 +1239,6 @@ inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, string) \ \ F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ @@ -2742,7 +2744,9 @@ class Context error = platforms[i].getDevices(type, &devices); #if defined(CL_HPP_ENABLE_EXCEPTIONS) - } catch (Error) {} + } catch (cl::Error& e) { + 
error = e.err(); + } // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type // We do error checking next anyway, and can throw there if needed #endif @@ -5925,27 +5929,28 @@ class Kernel : public detail::Wrapper ); } - template - void setSVMPointersHelper(std::array &pointerList, const pointer &t0, Ts... ts) + template + void setSVMPointersHelper(std::array &pointerList, const pointer &t0, const pointer &t1, Ts & ... ts) { pointerList[index] = static_cast(t0.get()); - setSVMPointersHelper(ts...); + setSVMPointersHelper(pointerList, t1, ts...); } - template + template typename std::enable_if::value, void>::type - setSVMPointersHelper(std::array &pointerList, T0 t0, Ts... ts) + setSVMPointersHelper(std::array &pointerList, T0 t0, T1 t1, Ts... ts) { pointerList[index] = static_cast(t0); - setSVMPointersHelper(ts...); + setSVMPointersHelper(pointerList, t1, ts...); } - + template void setSVMPointersHelper(std::array &pointerList, const pointer &t0) { pointerList[index] = static_cast(t0.get()); } + template typename std::enable_if::value, void>::type setSVMPointersHelper(std::array &pointerList, T0 t0) @@ -5954,7 +5959,7 @@ class Kernel : public detail::Wrapper } template - cl_int setSVMPointers(const T0 &t0, Ts... ts) + cl_int setSVMPointers(const T0 &t0, Ts & ... ts) { std::array pointerList; @@ -7206,7 +7211,7 @@ class CommandQueue : public detail::Wrapper return err; } - +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 cl_int enqueueReadBufferRect( const Buffer& buffer, cl_bool blocking, @@ -7321,7 +7326,7 @@ class CommandQueue : public detail::Wrapper return err; } - +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 #if CL_HPP_TARGET_OPENCL_VERSION >= 120 /** * Enqueue a command to fill a buffer object with a pattern @@ -9531,7 +9536,7 @@ class KernelFunctor } template - cl_int setSVMPointers(const T0 &t0, T1s... ts) + cl_int setSVMPointers(const T0 &t0, T1s &... 
ts) { return kernel_.setSVMPointers(t0, ts...); } diff --git a/src/CPUPipe.cpp b/src/CPUPipe.cpp new file mode 100644 index 000000000..f56366f30 --- /dev/null +++ b/src/CPUPipe.cpp @@ -0,0 +1,442 @@ +/* + This file is part of Leela Zero. + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ + +#include "config.h" + +#ifdef __APPLE__ +#include +#endif +#ifdef USE_MKL +#include +#endif +#ifdef USE_OPENBLAS +#include +#endif +#ifndef USE_BLAS +#include +#endif + +#include "CPUPipe.h" +#include "Im2Col.h" +#include "Network.h" + +#ifndef USE_BLAS +// Eigen helpers +template +using EigenMatrixMap = + Eigen::Map>; +template +using ConstEigenMatrixMap = + Eigen::Map>; +#endif + +void CPUPipe::initialize(int channels) { + m_input_channels = channels; +} + +void CPUPipe::winograd_transform_in(const std::vector& in, + std::vector& V, const int C) { + constexpr auto W = BOARD_SIZE; + constexpr auto H = BOARD_SIZE; + constexpr auto WTILES = WINOGRAD_WTILES; + constexpr auto P = WINOGRAD_P; + + constexpr auto Wpad = 2 + WINOGRAD_M * WTILES; + + constexpr auto buffersize = 32; + + std::array, Wpad> in_pad{{{0.0f}}}; + + std::array buffer; + auto buffer_offset = 0; + auto buffer_entries = 0; + + // multiple vector [i0..i5] by Bt and produce [o0..o5] + // const auto Bt = std::array{ + // 1.0f, 0.0f, -5.0f / 2.0f, 0.0f, 1.0f, 0.0f, + // 0.0f, -SQ2, -2.0f, SQ2 / 2.0f, 1.0f, 0.0f, + // 0.0f, SQ2, -2.0f, -SQ2 / 2.0f, 1.0f, 0.0f, + // 0.0f, -SQ2 / 2.0f, -1.0f / 2.0f, SQ2, 1.0f, 0.0f, + // 0.0f, SQ2 / 2.0f, -1.0f / 2.0f, -SQ2, 1.0f, 0.0f, + // 0.0f, 1.0f, 0.0f, -5.0f / 2.0f, 0.0f, 1.0f}; + const auto multiply_bt = [](float& o0, float& o1, float& o2, + float& o3, float& o4, float& o5, + const float i0, const float i1, const float i2, + const float i3, const float i4, const float i5) { + auto i3m1 = i1 * -SQ2 + i3 * (SQ2 / 2.0f); + auto i4m2 = i2 * -2.0f + i4 * 1.0f; + + o0 = i0 + i2 * (-5.0f / 2.0f) + i4; + o1 = i3m1 + i4m2; + o2 = -i3m1 + i4m2; + + auto i3m1_2 = i3 * (SQ2) + i1 * (-SQ2 / 2.0f); + auto i4m2_2 = i2 * (-1.0f / 2.0f) + i4; + + o3 = i3m1_2 + i4m2_2; + o4 = -i3m1_2 + i4m2_2; + + o5 = i1 + i3 * (-5.0f / 2.0f) + i5; + }; + + for (auto ch = 0; ch < C; ch++) { + for (auto yin = 0; yin < H; yin++) { + for (auto xin = 0; xin < W; xin++) { + in_pad[yin + 1][xin + 
1] = in[ch * (W * H) + yin * W + xin]; + } + } + for (auto block_y = 0; block_y < WTILES; block_y++) { + // Tiles overlap by 2 + const auto yin = WINOGRAD_M * block_y; + for (auto block_x = 0; block_x < WTILES; block_x++) { + const auto xin = WINOGRAD_M * block_x; +#define DECL_T1(XX) \ + float T1_##XX##_0, T1_##XX##_1, T1_##XX##_2, T1_##XX##_3, T1_##XX##_4, \ + T1_##XX##_5; + DECL_T1(0) + DECL_T1(1) + DECL_T1(2) + DECL_T1(3) + DECL_T1(4) + DECL_T1(5) + + // Calculates transpose(B).x.B +#define MULTIPLY_BT(XX) \ + multiply_bt(T1_0_##XX, T1_1_##XX, T1_2_##XX, T1_3_##XX, T1_4_##XX, \ + T1_5_##XX, \ + in_pad[yin + 0][xin + XX], \ + in_pad[yin + 1][xin + XX], \ + in_pad[yin + 2][xin + XX], \ + in_pad[yin + 3][xin + XX], \ + in_pad[yin + 4][xin + XX], \ + in_pad[yin + 5][xin + XX]); + MULTIPLY_BT(0) + MULTIPLY_BT(1) + MULTIPLY_BT(2) + MULTIPLY_BT(3) + MULTIPLY_BT(4) + MULTIPLY_BT(5) + +#define MULTIPLY_B(XX) \ + multiply_bt( \ + buffer[buffersize * (XX * WINOGRAD_ALPHA + 0) + buffer_entries], \ + buffer[buffersize * (XX * WINOGRAD_ALPHA + 1) + buffer_entries], \ + buffer[buffersize * (XX * WINOGRAD_ALPHA + 2) + buffer_entries], \ + buffer[buffersize * (XX * WINOGRAD_ALPHA + 3) + buffer_entries], \ + buffer[buffersize * (XX * WINOGRAD_ALPHA + 4) + buffer_entries], \ + buffer[buffersize * (XX * WINOGRAD_ALPHA + 5) + buffer_entries], \ + T1_##XX##_0, T1_##XX##_1, T1_##XX##_2, T1_##XX##_3, T1_##XX##_4, \ + T1_##XX##_5); + MULTIPLY_B(0) + MULTIPLY_B(1) + MULTIPLY_B(2) + MULTIPLY_B(3) + MULTIPLY_B(4) + MULTIPLY_B(5) + + if (buffer_entries == 0) { + buffer_offset = ch * P + block_y * WTILES + block_x; + } + buffer_entries++; + + if (buffer_entries >= buffersize + || (ch == C - 1 && block_x == WTILES - 1 + && block_y == WTILES - 1)) { + + for (auto i = 0; i < WINOGRAD_ALPHA * WINOGRAD_ALPHA; i++) { + for (auto entry = 0; entry < buffer_entries; entry++) { + V[i * C * P + buffer_offset + entry] = + buffer[i * buffersize + entry]; + } + } + buffer_entries = 0; + } + } + } + } +} 
+ +void CPUPipe::winograd_sgemm(const std::vector& U, + const std::vector& V, + std::vector& M, + const int C, const int K) { + constexpr auto P = WINOGRAD_P; + + for (auto b = 0; b < WINOGRAD_TILE; b++) { + const auto offset_u = b * K * C; + const auto offset_v = b * C * P; + const auto offset_m = b * K * P; +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, + K, P, C, + 1.0f, + &U[offset_u], K, + &V[offset_v], P, + 0.0f, + &M[offset_m], P); +#else + auto C_mat = EigenMatrixMap(M.data() + offset_m, P, K); + C_mat.noalias() = + ConstEigenMatrixMap(V.data() + offset_v, P, C) + * ConstEigenMatrixMap(U.data() + offset_u, K, C).transpose(); +#endif + } +} + +void CPUPipe::winograd_transform_out(const std::vector& M, + std::vector& Y, const int K) { + constexpr auto W = BOARD_SIZE; + constexpr auto H = BOARD_SIZE; + constexpr auto WTILES = WINOGRAD_WTILES; + constexpr auto P = WINOGRAD_P; + + // multiple vector [i0..i5] by At and produce [o0..o3] + // const auto At = std::array{ + // 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, + // 0.0f, SQ2 / 2.0f, -SQ2 / 2.0f, SQ2, -SQ2, 0.0f, + // 0.0f, 1.0f / 2.0f, 1.0f / 2.0f, 2.0f, 2.0f, 0.0f, + // 0.0f, SQ2 / 4.0f, -SQ2 / 4.0f, 2.0f * SQ2, -2.0f * SQ2, 1.0f}; + const auto multiply_at = [](float& o0, float& o1, float& o2, float& o3, + const float i0, const float i1, + const float i2, const float i3, + const float i4, const float i5) { + auto t1p2 = (i1 + i2) * (1.0f / 2.0f); + auto t1m2 = (i1 - i2) * (SQ2 / 4.0f); + auto t3p4 = i3 + i4; + auto t3m4 = (i3 - i4) * (SQ2); + + o0 = i0 + t1p2 + t1p2 + t3p4; + o1 = t1m2 + t1m2 + t3m4; + o2 = t1p2 + t3p4 + t3p4; + o3 = t1m2 + t3m4 + t3m4 + i5; + }; + + for (auto k = 0; k < K; k++) { + for (auto block_x = 0; block_x < WTILES; block_x++) { + const auto x = WINOGRAD_M * block_x; + for (auto block_y = 0; block_y < WTILES; block_y++) { + const auto y = WINOGRAD_M * block_y; + + const auto b = block_y * WTILES + block_x; + using WinogradTile = + std::array, + WINOGRAD_ALPHA>; + 
WinogradTile temp_m; + for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) { + for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) { + temp_m[xi][nu] = + M[(xi * WINOGRAD_ALPHA + nu) * K * P + k * P + b]; + } + } + std::array, WINOGRAD_M> temp; + std::array, WINOGRAD_M> o; + + // Calculates transpose(A).temp_m.A + for (auto j = 0; j < WINOGRAD_ALPHA; j++) { + multiply_at(temp[0][j], temp[1][j], temp[2][j], temp[3][j], + temp_m[0][j], temp_m[1][j], temp_m[2][j], + temp_m[3][j], temp_m[4][j], temp_m[5][j]); + } + + for (auto i = 0; i < WINOGRAD_M; i++) { + multiply_at(o[i][0], o[i][1], o[i][2], o[i][3], + temp[i][0], temp[i][1], temp[i][2], + temp[i][3], temp[i][4], temp[i][5]); + } + + const auto y_ind = k * H * W + y * W + x; + for (auto i = 0; i < WINOGRAD_M; i++) { + for (auto j = 0; j < WINOGRAD_M; j++) { + if (y + i < H && x + j < W) { + Y[y_ind + i * W + j] = o[i][j]; + } + } + } + } + } + } +} + +void CPUPipe::winograd_convolve3(const int outputs, + const std::vector& input, + const std::vector& U, + std::vector& V, + std::vector& M, + std::vector& output) { + + constexpr unsigned int filter_len = WINOGRAD_ALPHA * WINOGRAD_ALPHA; + const auto input_channels = U.size() / (outputs * filter_len); + + winograd_transform_in(input, V, input_channels); + winograd_sgemm(U, V, M, input_channels, outputs); + winograd_transform_out(M, output, outputs); +} + +template +void convolve(const size_t outputs, + const std::vector& input, + const std::vector& weights, + const std::vector& biases, + std::vector& output) { + // The size of the board is defined at compile time + constexpr unsigned int width = BOARD_SIZE; + constexpr unsigned int height = BOARD_SIZE; + constexpr auto num_intersections = width * height; + constexpr auto filter_len = filter_size * filter_size; + const auto input_channels = weights.size() / (biases.size() * filter_len); + const auto filter_dim = filter_len * input_channels; + assert(outputs * num_intersections == output.size()); + + std::vector col(filter_dim * 
width * height); + im2col(input_channels, input, col); + + // Weight shape (output, input, filter_size, filter_size) + // 96 18 3 3 + // C←αAB + βC + // outputs[96,19x19] = weights[96,18x3x3] x col[18x3x3,19x19] + // M Number of rows in matrices A and C. + // N Number of columns in matrices B and C. + // K Number of columns in matrix A; number of rows in matrix B. + // lda The size of the first dimention of matrix A; if you are + // passing a matrix A[m][n], the value should be m. + // cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + // ldb, beta, C, N); +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + // M N K + outputs, num_intersections, filter_dim, + 1.0f, &weights[0], filter_dim, + &col[0], num_intersections, + 0.0f, &output[0], num_intersections); +#else + auto C_mat = + EigenMatrixMap(output.data(), num_intersections, outputs); + C_mat.noalias() = + ConstEigenMatrixMap(col.data(), num_intersections, filter_dim) + * ConstEigenMatrixMap(weights.data(), filter_dim, outputs); +#endif + + for (unsigned int o = 0; o < outputs; o++) { + for (unsigned int b = 0; b < num_intersections; b++) { + output[(o * num_intersections) + b] += biases[o]; + } + } +} + +template +void batchnorm(const size_t channels, + std::vector& data, + const float* const means, + const float* const stddevs, + const float* const eltwise = nullptr) { + for (auto c = size_t{0}; c < channels; ++c) { + const auto mean = means[c]; + const auto scale_stddev = stddevs[c]; + const auto arr = &data[c * spatial_size]; + + if (eltwise == nullptr) { + // Classical BN + for (auto b = size_t{0}; b < spatial_size; b++) { + arr[b] = std::max(0.0f, scale_stddev * (arr[b] - mean)); + } + } else { + // BN + residual add + const auto res = &eltwise[c * spatial_size]; + for (auto b = size_t{0}; b < spatial_size; b++) { + arr[b] = + std::max(0.0f, (scale_stddev * (arr[b] - mean)) + res[b]); + } + } + } +} + +void CPUPipe::forward(const std::vector& input, + 
std::vector& output_pol, + std::vector& output_val) { + // Input convolution + constexpr auto P = WINOGRAD_P; + // Calculate output channels + const auto output_channels = m_input_channels; + // input_channels is the maximum number of input channels of any + // convolution. Residual blocks are identical, but the first convolution + // might be bigger when the network has very few filters + const auto input_channels = + std::max(static_cast(output_channels), + static_cast(Network::INPUT_CHANNELS)); + auto conv_out = std::vector(output_channels * NUM_INTERSECTIONS); + + auto V = std::vector(WINOGRAD_TILE * input_channels * P); + auto M = std::vector(WINOGRAD_TILE * output_channels * P); + + winograd_convolve3(output_channels, input, m_weights->m_conv_weights[0], V, + M, conv_out); + batchnorm(output_channels, conv_out, + m_weights->m_batchnorm_means[0].data(), + m_weights->m_batchnorm_stddevs[0].data()); + + // Residual tower + auto conv_in = std::vector(output_channels * NUM_INTERSECTIONS); + auto res = std::vector(output_channels * NUM_INTERSECTIONS); + for (auto i = size_t{1}; i < m_weights->m_conv_weights.size(); i += 2) { + auto output_channels = m_input_channels; + std::swap(conv_out, conv_in); + winograd_convolve3(output_channels, conv_in, + m_weights->m_conv_weights[i], V, M, conv_out); + batchnorm(output_channels, conv_out, + m_weights->m_batchnorm_means[i].data(), + m_weights->m_batchnorm_stddevs[i].data()); + + std::swap(conv_in, res); + std::swap(conv_out, conv_in); + winograd_convolve3(output_channels, conv_in, + m_weights->m_conv_weights[i + 1], V, M, conv_out); + batchnorm( + output_channels, conv_out, + m_weights->m_batchnorm_means[i + 1].data(), + m_weights->m_batchnorm_stddevs[i + 1].data(), res.data()); + } + convolve<1>(Network::OUTPUTS_POLICY, conv_out, m_conv_pol_w, m_conv_pol_b, + output_pol); + convolve<1>(Network::OUTPUTS_VALUE, conv_out, m_conv_val_w, m_conv_val_b, + output_val); +} + +void CPUPipe::push_weights(const unsigned int 
/*filter_size*/, + const unsigned int /*channels*/, + const unsigned int outputs, + std::shared_ptr weights) { + + m_weights = weights; + + // Output head convolutions + m_conv_pol_w = weights->m_conv_pol_w; + m_conv_pol_b.resize(m_conv_pol_w.size() / outputs, 0.0f); + m_conv_val_w = weights->m_conv_val_w; + m_conv_val_b.resize(m_conv_val_w.size() / outputs, 0.0f); +} diff --git a/src/CPUPipe.h b/src/CPUPipe.h new file mode 100644 index 000000000..618625332 --- /dev/null +++ b/src/CPUPipe.h @@ -0,0 +1,78 @@ +/* + This file is part of Leela Zero. + Copyright (C) 2018-2019 Junhee Yoo and contributors + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ + +#ifndef CPUPIPE_H_INCLUDED +#define CPUPIPE_H_INCLUDED +#include "config.h" + +#include +#include + +#include "ForwardPipe.h" + +class CPUPipe : public ForwardPipe { +public: + virtual void initialize(int channels); + virtual void forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val); + + virtual void push_weights( + unsigned int filter_size, unsigned int channels, unsigned int outputs, + std::shared_ptr weights); + +private: + void winograd_transform_in(const std::vector& in, + std::vector& V, int C); + + void winograd_sgemm(const std::vector& U, + const std::vector& V, + std::vector& M, int C, int K); + + void winograd_transform_out(const std::vector& M, + std::vector& Y, int K); + + void winograd_convolve3(int outputs, + const std::vector& input, + const std::vector& U, + std::vector& V, + std::vector& M, + std::vector& output); + + int m_input_channels; + + // Input + residual block tower + std::shared_ptr m_weights; + + std::vector m_conv_pol_w; + std::vector m_conv_val_w; + std::vector m_conv_pol_b; + std::vector m_conv_val_b; +}; +#endif diff --git a/src/Eigen b/src/Eigen new file mode 160000 index 000000000..cf794d3b7 --- /dev/null +++ b/src/Eigen @@ -0,0 +1 @@ +Subproject commit cf794d3b741a6278df169e58461f8529f43bce5d diff --git a/src/FastBoard.cpp b/src/FastBoard.cpp index 1c04d87a0..f302402b8 100644 --- a/src/FastBoard.cpp +++ b/src/FastBoard.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,25 +14,39 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ -#include "FastBoard.h" +#include "config.h" -#include +#include #include +#include +#include #include #include #include #include +#include "FastBoard.h" + #include "Utils.h" -#include "config.h" using namespace Utils; const int FastBoard::NBR_SHIFT; -const int FastBoard::MAXSQ; -const int FastBoard::BIG; +const int FastBoard::NUM_VERTICES; +const int FastBoard::NO_VERTEX; const int FastBoard::PASS; const int FastBoard::RESIGN; @@ -41,31 +55,31 @@ const std::array FastBoard::s_eyemask = { 4 * (1 << (NBR_SHIFT * WHITE)) }; -const std::array FastBoard::s_cinvert = { +const std::array FastBoard::s_cinvert = { WHITE, BLACK, EMPTY, INVAL }; -int FastBoard::get_boardsize(void) const { +int FastBoard::get_boardsize() const { return m_boardsize; } -int FastBoard::get_vertex(int x, int y) const { +int FastBoard::get_vertex(const int x, const int y) const { assert(x >= 0 && x < BOARD_SIZE); assert(y >= 0 && y < BOARD_SIZE); assert(x >= 0 && x < m_boardsize); assert(y >= 0 && y < m_boardsize); - int vertex = ((y + 1) * m_squaresize) + (x + 1); + int vertex = ((y + 1) * m_sidevertices) + (x + 1); - assert(vertex >= 0 && vertex < m_maxsq); + assert(vertex >= 0 && vertex < m_numvertices); return vertex; } -std::pair FastBoard::get_xy(int vertex) const { - //int vertex = ((y + 1) * (get_boardsize() + 2)) + (x + 1); - int x = (vertex % m_squaresize) - 1; - int y = (vertex / m_squaresize) - 1; +std::pair FastBoard::get_xy(const int vertex) const { + // int vertex 
= ((y + 1) * (get_boardsize() + 2)) + (x + 1); + int x = (vertex % m_sidevertices) - 1; + int y = (vertex / m_sidevertices) - 1; assert(x >= 0 && x < m_boardsize); assert(y >= 0 && y < m_boardsize); @@ -74,56 +88,57 @@ std::pair FastBoard::get_xy(int vertex) const { return std::make_pair(x, y); } -FastBoard::square_t FastBoard::get_square(int vertex) const { - assert(vertex >= 0 && vertex < MAXSQ); - assert(vertex >= 0 && vertex < m_maxsq); +FastBoard::vertex_t FastBoard::get_state(const int vertex) const { + assert(vertex >= 0 && vertex < NUM_VERTICES); + assert(vertex >= 0 && vertex < m_numvertices); - return m_square[vertex]; + return m_state[vertex]; } -void FastBoard::set_square(int vertex, FastBoard::square_t content) { - assert(vertex >= 0 && vertex < MAXSQ); - assert(vertex >= 0 && vertex < m_maxsq); +void FastBoard::set_state(const int vertex, const FastBoard::vertex_t content) { + assert(vertex >= 0 && vertex < NUM_VERTICES); + assert(vertex >= 0 && vertex < m_numvertices); assert(content >= BLACK && content <= INVAL); - m_square[vertex] = content; + m_state[vertex] = content; } -FastBoard::square_t FastBoard::get_square(int x, int y) const { - return get_square(get_vertex(x, y)); +FastBoard::vertex_t FastBoard::get_state(const int x, const int y) const { + return get_state(get_vertex(x, y)); } -void FastBoard::set_square(int x, int y, FastBoard::square_t content) { - set_square(get_vertex(x, y), content); +void FastBoard::set_state(const int x, const int y, + const FastBoard::vertex_t content) { + set_state(get_vertex(x, y), content); } -void FastBoard::reset_board(int size) { +void FastBoard::reset_board(const int size) { m_boardsize = size; - m_squaresize = size + 2; - m_maxsq = m_squaresize * m_squaresize; + m_sidevertices = size + 2; + m_numvertices = m_sidevertices * m_sidevertices; m_tomove = BLACK; m_prisoners[BLACK] = 0; m_prisoners[WHITE] = 0; m_empty_cnt = 0; - m_dirs[0] = -m_squaresize; + m_dirs[0] = -m_sidevertices; m_dirs[1] = +1; - 
m_dirs[2] = +m_squaresize; + m_dirs[2] = +m_sidevertices; m_dirs[3] = -1; - for (int i = 0; i < m_maxsq; i++) { - m_square[i] = INVAL; + for (int i = 0; i < m_numvertices; i++) { + m_state[i] = INVAL; m_neighbours[i] = 0; - m_parent[i] = MAXSQ; + m_parent[i] = NUM_VERTICES; } for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { int vertex = get_vertex(i, j); - m_square[vertex] = EMPTY; - m_empty_idx[vertex] = m_empty_cnt; - m_empty[m_empty_cnt++] = vertex; + m_state[vertex] = EMPTY; + m_empty_idx[vertex] = m_empty_cnt; + m_empty[m_empty_cnt++] = vertex; if (i == 0 || i == size - 1) { m_neighbours[vertex] += (1 << (NBR_SHIFT * BLACK)) @@ -143,12 +158,14 @@ void FastBoard::reset_board(int size) { } } - m_parent[MAXSQ] = MAXSQ; - m_libs[MAXSQ] = 16384; /* we will subtract from this */ - m_next[MAXSQ] = MAXSQ; + m_parent[NUM_VERTICES] = NUM_VERTICES; + m_libs[NUM_VERTICES] = 16384; /* we will subtract from this */ + m_next[NUM_VERTICES] = NUM_VERTICES; + + assert(m_state[NO_VERTEX] == INVAL); } -bool FastBoard::is_suicide(int i, int color) const { +bool FastBoard::is_suicide(const int i, const int color) const { // If there are liberties next to us, it is never suicide if (count_pliberties(i)) { return false; @@ -159,12 +176,12 @@ bool FastBoard::is_suicide(int i, int color) const { auto ai = i + m_dirs[k]; auto libs = m_libs[m_parent[ai]]; - if (get_square(ai) == color) { + if (get_state(ai) == color) { if (libs > 1) { // connecting to live group = not suicide return false; } - } else if (get_square(ai) == !color) { + } else if (get_state(ai) == !color) { if (libs <= 1) { // killing neighbour = not suicide return false; @@ -197,7 +214,8 @@ void FastBoard::add_neighbour(const int vtx, const int color) { for (int k = 0; k < 4; k++) { int ai = vtx + m_dirs[k]; - m_neighbours[ai] += (1 << (NBR_SHIFT * color)) - (1 << (NBR_SHIFT * EMPTY)); + m_neighbours[ai] += (1 << (NBR_SHIFT * color)) + - (1 << (NBR_SHIFT * EMPTY)); bool found = false; for (int i = 0; i < 
nbr_par_cnt; i++) { @@ -239,14 +257,14 @@ void FastBoard::remove_neighbour(const int vtx, const int color) { } } -int FastBoard::calc_reach_color(int color) const { +int FastBoard::calc_reach_color(const int color) const { auto reachable = 0; - auto bd = std::vector(m_maxsq, false); + auto bd = std::vector(m_numvertices, false); auto open = std::queue(); for (auto i = 0; i < m_boardsize; i++) { for (auto j = 0; j < m_boardsize; j++) { auto vertex = get_vertex(i, j); - if (m_square[vertex] == color) { + if (m_state[vertex] == color) { reachable++; bd[vertex] = true; open.push(vertex); @@ -260,7 +278,7 @@ int FastBoard::calc_reach_color(int color) const { for (auto k = 0; k < 4; k++) { auto neighbor = vertex + m_dirs[k]; - if (!bd[neighbor] && m_square[neighbor] == EMPTY) { + if (!bd[neighbor] && m_state[neighbor] == EMPTY) { reachable++; bd[neighbor] = true; open.push(neighbor); @@ -271,38 +289,42 @@ int FastBoard::calc_reach_color(int color) const { } // Needed for scoring passed out games not in MC playouts -float FastBoard::area_score(float komi) const { +float FastBoard::area_score(const float komi) const { auto white = calc_reach_color(WHITE); auto black = calc_reach_color(BLACK); return black - white - komi; } -void FastBoard::display_board(int lastmove) { +void FastBoard::display_board(const int lastmove) { int boardsize = get_boardsize(); myprintf("\n "); print_columns(); - for (int j = boardsize-1; j >= 0; j--) { - myprintf("%2d", j+1); + for (int j = boardsize - 1; j >= 0; j--) { + myprintf("%2d", j + 1); if (lastmove == get_vertex(0, j)) myprintf("("); else myprintf(" "); for (int i = 0; i < boardsize; i++) { - if (get_square(i,j) == WHITE) { + if (get_state(i, j) == WHITE) { myprintf("O"); - } else if (get_square(i,j) == BLACK) { + } else if (get_state(i, j) == BLACK) { myprintf("X"); } else if (starpoint(boardsize, i, j)) { myprintf("+"); } else { myprintf("."); } - if (lastmove == get_vertex(i, j)) myprintf(")"); - else if (i != boardsize-1 && lastmove 
== get_vertex(i, j)+1) myprintf("("); - else myprintf(" "); + if (lastmove == get_vertex(i, j)) { + myprintf(")"); + } else if (i != boardsize - 1 && lastmove == get_vertex(i, j) + 1) { + myprintf("("); + } else { + myprintf(" "); + } } - myprintf("%2d\n", j+1); + myprintf("%2d\n", j + 1); } myprintf(" "); print_columns(); @@ -314,14 +336,15 @@ void FastBoard::print_columns() { if (i < 25) { myprintf("%c ", (('a' + i < 'i') ? 'a' + i : 'a' + i + 1)); } else { - myprintf("%c ", (('A' + (i - 25) < 'I') ? 'A' + (i - 25) : 'A' + (i - 25) + 1)); + myprintf("%c ", (('A' + (i - 25) < 'I') ? 'A' + (i - 25) + : 'A' + (i - 25) + 1)); } } myprintf("\n"); } void FastBoard::merge_strings(const int ip, const int aip) { - assert(ip != MAXSQ && aip != MAXSQ); + assert(ip != NUM_VERTICES && aip != NUM_VERTICES); /* merge stones */ m_stones[ip] += m_stones[aip]; @@ -334,7 +357,7 @@ void FastBoard::merge_strings(const int ip, const int aip) { for (int k = 0; k < 4; k++) { int ai = newpos + m_dirs[k]; // for each liberty, check if it is not shared - if (m_square[ai] == EMPTY) { + if (m_state[ai] == EMPTY) { // find liberty neighbors bool found = false; for (int kk = 0; kk < 4; kk++) { @@ -380,10 +403,10 @@ bool FastBoard::is_eye(const int color, const int i) const { colorcount[WHITE] = 0; colorcount[INVAL] = 0; - colorcount[m_square[i - 1 - m_squaresize]]++; - colorcount[m_square[i + 1 - m_squaresize]]++; - colorcount[m_square[i - 1 + m_squaresize]]++; - colorcount[m_square[i + 1 + m_squaresize]]++; + colorcount[m_state[i - 1 - m_sidevertices]]++; + colorcount[m_state[i + 1 - m_sidevertices]]++; + colorcount[m_state[i - 1 + m_sidevertices]]++; + colorcount[m_state[i + 1 + m_sidevertices]]++; if (colorcount[INVAL] == 0) { if (colorcount[!color] > 1) { @@ -398,24 +421,23 @@ bool FastBoard::is_eye(const int color, const int i) const { return true; } -std::string FastBoard::move_to_text(int move) const { +std::string FastBoard::move_to_text(const int move) const { std::ostringstream 
result; - int column = move % m_squaresize; - int row = move / m_squaresize; + int column = move % m_sidevertices; + int row = move / m_sidevertices; column--; row--; - assert(move == FastBoard::PASS - || move == FastBoard::RESIGN + assert(move == FastBoard::PASS || move == FastBoard::RESIGN || (row >= 0 && row < m_boardsize)); - assert(move == FastBoard::PASS - || move == FastBoard::RESIGN + assert(move == FastBoard::PASS || move == FastBoard::RESIGN || (column >= 0 && column < m_boardsize)); - if (move >= 0 && move <= m_maxsq) { - result << static_cast(column < 8 ? 'A' + column : 'A' + column + 1); + if (move >= 0 && move <= m_numvertices) { + result << static_cast(column < 8 ? 'A' + column + : 'A' + column + 1); result << (row + 1); } else if (move == FastBoard::PASS) { result << "pass"; @@ -428,26 +450,53 @@ std::string FastBoard::move_to_text(int move) const { return result.str(); } -std::string FastBoard::move_to_text_sgf(int move) const { +int FastBoard::text_to_move(std::string move) const { + transform(cbegin(move), cend(move), begin(move), tolower); + + if (move == "pass") { + return PASS; + } else if (move == "resign") { + return RESIGN; + } else if (move.size() < 2 || !std::isalpha(move[0]) + || !std::isdigit(move[1]) || move[0] == 'i') { + return NO_VERTEX; + } + + auto column = move[0] - 'a'; + if (move[0] > 'i') { + --column; + } + + int row; + std::istringstream parsestream(move.substr(1)); + parsestream >> row; + --row; + + if (row >= m_boardsize || column >= m_boardsize) { + return NO_VERTEX; + } + + return get_vertex(column, row); +} + +std::string FastBoard::move_to_text_sgf(const int move) const { std::ostringstream result; - int column = move % m_squaresize; - int row = move / m_squaresize; + int column = move % m_sidevertices; + int row = move / m_sidevertices; column--; row--; - assert(move == FastBoard::PASS - || move == FastBoard::RESIGN + assert(move == FastBoard::PASS || move == FastBoard::RESIGN || (row >= 0 && row < m_boardsize)); - 
assert(move == FastBoard::PASS - || move == FastBoard::RESIGN + assert(move == FastBoard::PASS || move == FastBoard::RESIGN || (column >= 0 && column < m_boardsize)); // SGF inverts rows row = m_boardsize - row - 1; - if (move >= 0 && move <= m_maxsq) { + if (move >= 0 && move <= m_numvertices) { if (column <= 25) { result << static_cast('a' + column); } else { @@ -469,7 +518,7 @@ std::string FastBoard::move_to_text_sgf(int move) const { return result.str(); } -bool FastBoard::starpoint(int size, int point) { +bool FastBoard::starpoint(const int size, const int point) { int stars[3]; int points[2]; int hits = 0; @@ -496,11 +545,11 @@ bool FastBoard::starpoint(int size, int point) { return hits >= 2; } -bool FastBoard::starpoint(int size, int x, int y) { +bool FastBoard::starpoint(const int size, const int x, const int y) { return starpoint(size, y * size + x); } -int FastBoard::get_prisoners(int side) const { +int FastBoard::get_prisoners(const int side) const { assert(side == WHITE || side == BLACK); return m_prisoners[side]; @@ -518,11 +567,11 @@ bool FastBoard::white_to_move() const { return m_tomove == WHITE; } -void FastBoard::set_to_move(int tomove) { +void FastBoard::set_to_move(const int tomove) { m_tomove = tomove; } -std::string FastBoard::get_string(int vertex) const { +std::string FastBoard::get_string(const int vertex) const { std::string result; int start = m_parent[vertex]; @@ -547,7 +596,7 @@ std::string FastBoard::get_stone_list() const { for (int j = 0; j < m_boardsize; j++) { int vertex = get_vertex(i, j); - if (get_square(vertex) != EMPTY) { + if (get_state(vertex) != EMPTY) { result += move_to_text(vertex) + " "; } } diff --git a/src/FastBoard.h b/src/FastBoard.h index c11c60459..8cbd192f1 100644 --- a/src/FastBoard.h +++ b/src/FastBoard.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef FASTBOARD_H_INCLUDED @@ -29,6 +40,7 @@ class FastBoard { friend class FastState; + public: /* neighbor counts are up to 4, so 3 bits is ok, @@ -38,48 +50,41 @@ class FastBoard { static constexpr int NBR_MASK = (1 << NBR_SHIFT) - 1; /* - highest existing square + number of vertices in a "letterboxed" board representation */ - static constexpr int MAXSQ = ((BOARD_SIZE + 2) * (BOARD_SIZE + 2)); + static constexpr int NUM_VERTICES = ((BOARD_SIZE + 2) * (BOARD_SIZE + 2)); /* - infinite score + no applicable vertex */ - static constexpr int BIG = 10000000; - + static constexpr int NO_VERTEX = 0; /* vertex of a pass */ - static constexpr int PASS = -1; + static constexpr int PASS = -1; /* vertex of a "resign move" */ static constexpr int RESIGN = -2; /* - possible contents of a square + possible contents of a vertex */ - enum square_t : char { + enum vertex_t : char { BLACK = 0, WHITE = 1, EMPTY = 2, INVAL = 3 }; - /* - move generation types - */ - using movescore_t = std::pair; - using scoredmoves_t = std::vector; - - int get_boardsize(void) const; - square_t 
get_square(int x, int y) const; - square_t get_square(int vertex) const ; - int get_vertex(int i, int j) const; - void set_square(int x, int y, square_t content); - void set_square(int vertex, square_t content); + int get_boardsize() const; + vertex_t get_state(int x, int y) const; + vertex_t get_state(int vertex) const; + int get_vertex(int x, int y) const; + void set_state(int x, int y, vertex_t content); + void set_state(int vertex, vertex_t content); std::pair get_xy(int vertex) const; bool is_suicide(int i, int color) const; - int count_pliberties(const int i) const; - bool is_eye(const int color, const int vtx) const; + int count_pliberties(int i) const; + bool is_eye(int color, int vtx) const; float area_score(float komi) const; @@ -90,6 +95,7 @@ class FastBoard { void set_to_move(int color); std::string move_to_text(int move) const; + int text_to_move(std::string move) const; std::string move_to_text_sgf(int move) const; std::string get_stone_list() const; std::string get_string(int vertex) const; @@ -105,32 +111,32 @@ class FastBoard { bit masks to detect eyes on neighbors */ static const std::array s_eyemask; - static const std::array s_cinvert; /* color inversion */ - - std::array m_square; /* board contents */ - std::array m_next; /* next stone in string */ - std::array m_parent; /* parent node of string */ - std::array m_libs; /* liberties per string parent */ - std::array m_stones; /* stones per string parent */ - std::array m_neighbours; /* counts of neighboring stones */ - std::array m_dirs; /* movement directions 4 way */ - std::array m_prisoners; /* prisoners per color */ - std::array m_empty; /* empty squares */ - std::array m_empty_idx; /* indexes of square */ - int m_empty_cnt; /* count of empties */ + static const std::array s_cinvert; /* color inversion */ + + std::array m_state; /* board contents */ + std::array m_next; /* next stone in string */ + std::array m_parent; /* parent node of string */ + std::array m_libs; /* liberties per string 
parent */ + std::array m_stones; /* stones per string parent */ + std::array m_neighbours; /* counts of neighboring stones */ + std::array m_dirs; /* movement directions 4 way */ + std::array m_prisoners; /* prisoners per color */ + std::array m_empty; /* empty intersections */ + std::array m_empty_idx; /* intersection indices */ + int m_empty_cnt; /* count of empties */ int m_tomove; - int m_maxsq; + int m_numvertices; int m_boardsize; - int m_squaresize; + int m_sidevertices; int calc_reach_color(int color) const; - int count_neighbours(const int color, const int i) const; - void merge_strings(const int ip, const int aip); - void add_neighbour(const int i, const int color); - void remove_neighbour(const int i, const int color); + int count_neighbours(int color, int i) const; + void merge_strings(int ip, int aip); + void add_neighbour(int i, int color); + void remove_neighbour(int i, int color); void print_columns(); }; diff --git a/src/FastState.cpp b/src/FastState.cpp index 451613e60..ce878a692 100644 --- a/src/FastState.cpp +++ b/src/FastState.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,28 +14,41 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" -#include "FastState.h" #include #include #include +#include "FastState.h" + #include "FastBoard.h" +#include "GTP.h" #include "Utils.h" #include "Zobrist.h" using namespace Utils; -void FastState::init_game(int size, float komi) { +void FastState::init_game(const int size, const float komi) { board.reset_board(size); m_movenum = 0; - m_komove = 0; - m_lastmove = 0; + m_komove = FastBoard::NO_VERTEX; + m_lastmove = FastBoard::NO_VERTEX; m_komi = komi; m_handicap = 0; m_passes = 0; @@ -43,41 +56,41 @@ void FastState::init_game(int size, float komi) { return; } -void FastState::set_komi(float komi) { +void FastState::set_komi(const float komi) { m_komi = komi; } -void FastState::reset_game(void) { +void FastState::reset_game() { reset_board(); m_movenum = 0; m_passes = 0; m_handicap = 0; - m_komove = 0; - m_lastmove = 0; + m_komove = FastBoard::NO_VERTEX; + m_lastmove = FastBoard::NO_VERTEX; } -void FastState::reset_board(void) { +void FastState::reset_board() { board.reset_board(board.get_boardsize()); } -bool FastState::is_move_legal(int color, int vertex) { - return vertex == FastBoard::PASS || - vertex == FastBoard::RESIGN || - (vertex != m_komove && - board.get_square(vertex) == FastBoard::EMPTY && - !board.is_suicide(vertex, color)); +bool FastState::is_move_legal(const int color, const int vertex) const { + return !cfg_analyze_tags.is_to_avoid(color, vertex, m_movenum) + && (vertex == FastBoard::PASS || vertex == 
FastBoard::RESIGN + || (vertex != m_komove + && board.get_state(vertex) == FastBoard::EMPTY + && !board.is_suicide(vertex, color))); } -void FastState::play_move(int vertex) { +void FastState::play_move(const int vertex) { play_move(board.m_tomove, vertex); } -void FastState::play_move(int color, int vertex) { +void FastState::play_move(const int color, const int vertex) { board.m_hash ^= Zobrist::zobrist_ko[m_komove]; if (vertex == FastBoard::PASS) { // No Ko move - m_komove = 0; + m_komove = FastBoard::NO_VERTEX; } else { m_komove = board.update_board(color, vertex); } @@ -104,7 +117,7 @@ size_t FastState::get_movenum() const { return m_movenum; } -int FastState::get_last_move(void) const { +int FastState::get_last_move() const { return m_lastmove; } @@ -112,7 +125,7 @@ int FastState::get_passes() const { return m_passes; } -void FastState::set_passes(int val) { +void FastState::set_passes(const int val) { m_passes = val; } @@ -125,7 +138,7 @@ int FastState::get_to_move() const { return board.m_tomove; } -void FastState::set_to_move(int tom) { +void FastState::set_to_move(const int tom) { board.set_to_move(tom); } @@ -143,7 +156,7 @@ void FastState::display_state() { board.display_board(get_last_move()); } -std::string FastState::move_to_text(int move) { +std::string FastState::move_to_text(const int move) const { return board.move_to_text(move); } @@ -155,10 +168,14 @@ float FastState::get_komi() const { return m_komi; } -void FastState::set_handicap(int hcap) { +void FastState::set_handicap(const int hcap) { m_handicap = hcap; } int FastState::get_handicap() const { return m_handicap; } + +std::uint64_t FastState::get_symmetry_hash(const int symmetry) const { + return board.calc_symmetry_hash(m_komove, symmetry); +} diff --git a/src/FastState.h b/src/FastState.h index 502816a14..63a44e272 100644 --- a/src/FastState.h +++ b/src/FastState.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,13 +14,24 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef FASTSTATE_H_INCLUDED #define FASTSTATE_H_INCLUDED -#include #include +#include #include #include @@ -33,8 +44,7 @@ class FastState { void reset_board(); void play_move(int vertex); - - bool is_move_legal(int color, int vertex); + bool is_move_legal(int color, int vertex) const; void set_komi(float komi); float get_komi() const; @@ -47,11 +57,12 @@ class FastState { void increment_passes(); float final_score() const; + std::uint64_t get_symmetry_hash(int symmetry) const; size_t get_movenum() const; int get_last_move() const; void display_state(); - std::string move_to_text(int move); + std::string move_to_text(int move) const; FullBoard board; diff --git a/src/ForwardPipe.h b/src/ForwardPipe.h new file mode 100644 index 000000000..a5c70c14a --- /dev/null +++ b/src/ForwardPipe.h @@ -0,0 +1,73 @@ +/* + This file is part of Leela Zero. 
+ Copyright (C) 2018-2019 Junhee Yoo and contributors + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ + +#ifndef FORWARDPIPE_H_INCLUDED +#define FORWARDPIPE_H_INCLUDED + +#include "config.h" + +#include +#include + +class ForwardPipe { +public: + class ForwardPipeWeights { + public: + // Input + residual block tower + std::vector> m_conv_weights; + std::vector> m_conv_biases; + std::vector> m_batchnorm_means; + std::vector> m_batchnorm_stddevs; + + // Policy head + std::vector m_conv_pol_w; + std::vector m_conv_pol_b; + + std::vector m_conv_val_w; + std::vector m_conv_val_b; + }; + + virtual ~ForwardPipe() = default; + + virtual void initialize(int channels) = 0; + virtual bool needs_autodetect() { + return false; + }; + virtual void forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val) = 0; + virtual void push_weights( + unsigned int filter_size, unsigned int channels, unsigned int outputs, + std::shared_ptr weights) = 0; + + virtual void drain() {} + virtual void resume() {} +}; + +#endif diff --git a/src/FullBoard.cpp b/src/FullBoard.cpp index aac3d2896..407d85164 100644 --- a/src/FullBoard.cpp +++ b/src/FullBoard.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" @@ -22,31 +33,33 @@ #include #include "FullBoard.h" + +#include "Network.h" #include "Utils.h" #include "Zobrist.h" using namespace Utils; -int FullBoard::remove_string(int i) { +int FullBoard::remove_string(const int i) { int pos = i; int removed = 0; - int color = m_square[i]; + int color = m_state[i]; do { - m_hash ^= Zobrist::zobrist[m_square[pos]][pos]; - m_ko_hash ^= Zobrist::zobrist[m_square[pos]][pos]; + m_hash ^= Zobrist::zobrist[m_state[pos]][pos]; + m_ko_hash ^= Zobrist::zobrist[m_state[pos]][pos]; - m_square[pos] = EMPTY; - m_parent[pos] = MAXSQ; + m_state[pos] = EMPTY; + m_parent[pos] = NUM_VERTICES; remove_neighbour(pos, color); - m_empty_idx[pos] = m_empty_cnt; - m_empty[m_empty_cnt] = pos; + m_empty_idx[pos] = m_empty_cnt; + m_empty[m_empty_cnt] = pos; m_empty_cnt++; - m_hash ^= Zobrist::zobrist[m_square[pos]][pos]; - m_ko_hash ^= Zobrist::zobrist[m_square[pos]][pos]; + m_hash ^= Zobrist::zobrist[m_state[pos]][pos]; + m_ko_hash ^= Zobrist::zobrist[m_state[pos]][pos]; removed++; pos = m_next[pos]; @@ -55,26 +68,26 @@ int FullBoard::remove_string(int i) { return removed; } -std::uint64_t FullBoard::calc_ko_hash(void) { +std::uint64_t FullBoard::calc_ko_hash() const { auto res = Zobrist::zobrist_empty; - for (int i = 0; i < m_maxsq; i++) { - if (m_square[i] != INVAL) { - res ^= Zobrist::zobrist[m_square[i]][i]; + for (auto i = 0; i < m_numvertices; i++) { + if (m_state[i] != INVAL) { + res ^= 
Zobrist::zobrist[m_state[i]][i]; } } /* Tromp-Taylor has positional superko */ - m_ko_hash = res; return res; } -std::uint64_t FullBoard::calc_hash(int komove) { +template +std::uint64_t FullBoard::calc_hash(const int komove, Function transform) const { auto res = Zobrist::zobrist_empty; - for (int i = 0; i < m_maxsq; i++) { - if (m_square[i] != INVAL) { - res ^= Zobrist::zobrist[m_square[i]][i]; + for (auto i = 0; i < m_numvertices; i++) { + if (m_state[i] != INVAL) { + res ^= Zobrist::zobrist[m_state[i]][transform(i)]; } } @@ -86,22 +99,37 @@ std::uint64_t FullBoard::calc_hash(int komove) { res ^= Zobrist::zobrist_blacktomove; } - res ^= Zobrist::zobrist_ko[komove]; - - m_hash = res; + res ^= Zobrist::zobrist_ko[transform(komove)]; return res; } -std::uint64_t FullBoard::get_hash(void) const { +std::uint64_t FullBoard::calc_hash(const int komove) const { + return calc_hash(komove, [](const auto vertex) { return vertex; }); +} + +std::uint64_t FullBoard::calc_symmetry_hash(const int komove, + const int symmetry) const { + return calc_hash(komove, [this, symmetry](const auto vertex) { + if (vertex == NO_VERTEX) { + return NO_VERTEX; + } else { + const auto newvtx = + Network::get_symmetry(get_xy(vertex), symmetry, m_boardsize); + return get_vertex(newvtx.first, newvtx.second); + } + }); +} + +std::uint64_t FullBoard::get_hash() const { return m_hash; } -std::uint64_t FullBoard::get_ko_hash(void) const { +std::uint64_t FullBoard::get_ko_hash() const { return m_ko_hash; } -void FullBoard::set_to_move(int tomove) { +void FullBoard::set_to_move(const int tomove) { if (m_tomove != tomove) { m_hash ^= Zobrist::zobrist_blacktomove; } @@ -110,19 +138,19 @@ void FullBoard::set_to_move(int tomove) { int FullBoard::update_board(const int color, const int i) { assert(i != FastBoard::PASS); - assert(m_square[i] == EMPTY); + assert(m_state[i] == EMPTY); - m_hash ^= Zobrist::zobrist[m_square[i]][i]; - m_ko_hash ^= Zobrist::zobrist[m_square[i]][i]; + m_hash ^= 
Zobrist::zobrist[m_state[i]][i]; + m_ko_hash ^= Zobrist::zobrist[m_state[i]][i]; - m_square[i] = square_t(color); + m_state[i] = vertex_t(color); m_next[i] = i; m_parent[i] = i; m_libs[i] = count_pliberties(i); m_stones[i] = 1; - m_hash ^= Zobrist::zobrist[m_square[i]][i]; - m_ko_hash ^= Zobrist::zobrist[m_square[i]][i]; + m_hash ^= Zobrist::zobrist[m_state[i]][i]; + m_ko_hash ^= Zobrist::zobrist[m_state[i]][i]; /* update neighbor liberties (they all lose 1) */ add_neighbour(i, color); @@ -131,18 +159,18 @@ int FullBoard::update_board(const int color, const int i) { auto eyeplay = (m_neighbours[i] & s_eyemask[!color]); auto captured_stones = 0; - int captured_sq; + int captured_vtx; for (int k = 0; k < 4; k++) { int ai = i + m_dirs[k]; - if (m_square[ai] == !color) { + if (m_state[ai] == !color) { if (m_libs[m_parent[ai]] <= 0) { int this_captured = remove_string(ai); - captured_sq = ai; + captured_vtx = ai; captured_stones += this_captured; } - } else if (m_square[ai] == color) { + } else if (m_state[ai] == color) { int ip = m_parent[i]; int aip = m_parent[ai]; @@ -173,24 +201,24 @@ int FullBoard::update_board(const int color, const int i) { /* check for possible simple ko */ if (captured_stones == 1 && eyeplay) { - assert(get_square(captured_sq) == FastBoard::EMPTY - && !is_suicide(captured_sq, !color)); - return captured_sq; + assert(get_state(captured_vtx) == FastBoard::EMPTY + && !is_suicide(captured_vtx, !color)); + return captured_vtx; } // No ko - return 0; + return NO_VERTEX; } -void FullBoard::display_board(int lastmove) { +void FullBoard::display_board(const int lastmove) { FastBoard::display_board(lastmove); myprintf("Hash: %llX Ko-Hash: %llX\n\n", get_hash(), get_ko_hash()); } -void FullBoard::reset_board(int size) { +void FullBoard::reset_board(const int size) { FastBoard::reset_board(size); - calc_hash(); - calc_ko_hash(); + m_hash = calc_hash(); + m_ko_hash = calc_ko_hash(); } diff --git a/src/FullBoard.h b/src/FullBoard.h index d7f69799d..b3afd843d 
100644 --- a/src/FullBoard.h +++ b/src/FullBoard.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,31 +14,50 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #ifndef FULLBOARD_H_INCLUDED #define FULLBOARD_H_INCLUDED #include "config.h" + #include + #include "FastBoard.h" class FullBoard : public FastBoard { public: int remove_string(int i); - int update_board(const int color, const int i); + int update_board(int color, int i); - std::uint64_t calc_hash(int komove = 0); - std::uint64_t calc_ko_hash(void); - std::uint64_t get_hash(void) const; - std::uint64_t get_ko_hash(void) const; + std::uint64_t get_hash() const; + std::uint64_t get_ko_hash() const; void set_to_move(int tomove); void reset_board(int size); void display_board(int lastmove = -1); + std::uint64_t calc_hash(int komove = NO_VERTEX) const; + std::uint64_t calc_symmetry_hash(int komove, int symmetry) const; + std::uint64_t calc_ko_hash() const; + std::uint64_t m_hash; std::uint64_t m_ko_hash; + +private: + template + std::uint64_t calc_hash(int komove, Function transform) const; }; #endif diff --git a/src/GTP.cpp b/src/GTP.cpp index 6969fabf6..8c3637bac 100644 --- a/src/GTP.cpp +++ b/src/GTP.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,12 +14,23 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" -#include "GTP.h" #include +#include #include #include #include @@ -32,6 +43,8 @@ #include #include +#include "GTP.h" + #include "FastBoard.h" #include "FullBoard.h" #include "GameState.h" @@ -47,10 +60,13 @@ using namespace Utils; // Configuration flags bool cfg_gtp_mode; bool cfg_allow_pondering; -int cfg_num_threads; -int cfg_max_threads; +unsigned int cfg_num_threads; +unsigned int cfg_batch_size; int cfg_max_playouts; int cfg_max_visits; +size_t cfg_max_memory; +size_t cfg_max_tree_size; +int cfg_max_cache_ratio_percent; TimeManagement::enabled_t cfg_timemanage; int cfg_lagbuffer_cs; int cfg_resignpct; @@ -64,42 +80,280 @@ bool cfg_dumbpass; std::vector cfg_gpus; bool cfg_sgemm_exhaustive; bool cfg_tune_only; +#ifdef USE_HALF +precision_t cfg_precision; +#endif #endif float cfg_puct; +float cfg_logpuct; +float cfg_logconst; float cfg_softmax_temp; float cfg_fpu_reduction; +float cfg_fpu_root_reduction; +float cfg_ci_alpha; +float cfg_lcb_min_visit_ratio; std::string cfg_weightsfile; std::string cfg_logfile; FILE* cfg_logfile_handle; bool cfg_quiet; std::string cfg_options_str; bool cfg_benchmark; +bool cfg_cpu_only; +AnalyzeTags cfg_analyze_tags; + +/* Parses tags for the lz-analyze GTP command and friends */ +AnalyzeTags::AnalyzeTags(std::istringstream& cmdstream, const GameState& game) { + std::string tag; + + /* Default color is the current one */ + m_who = game.board.get_to_move(); + + auto avoid_not_pass_resign_b = false, avoid_not_pass_resign_w = false; + auto allow_b = false, allow_w = false; + + while (true) { + cmdstream >> std::ws; + if (isdigit(cmdstream.peek())) { + tag = "interval"; + } else { + cmdstream >> tag; + if (cmdstream.fail() && cmdstream.eof()) { + /* Parsing complete */ + m_invalid = false; + return; + } + } + + if (tag == "avoid" || tag == "allow") { + std::string textcolor, textmoves; + size_t until_movenum; + cmdstream >> textcolor; + cmdstream >> textmoves; + cmdstream >> until_movenum; + if 
(cmdstream.fail()) { + return; + } + + std::vector moves; + std::istringstream movestream(textmoves); + while (!movestream.eof()) { + std::string textmove; + getline(movestream, textmove, ','); + auto sepidx = textmove.find_first_of(':'); + if (sepidx != std::string::npos) { + if (!(sepidx == 2 || sepidx == 3)) { + moves.clear(); + break; + } + auto move1_compressed = + game.board.text_to_move(textmove.substr(0, sepidx)); + auto move2_compressed = + game.board.text_to_move(textmove.substr(sepidx + 1)); + if (move1_compressed == FastBoard::NO_VERTEX || + move1_compressed == FastBoard::PASS || + move1_compressed == FastBoard::RESIGN || + move2_compressed == FastBoard::NO_VERTEX || + move2_compressed == FastBoard::PASS || + move2_compressed == FastBoard::RESIGN) { + moves.clear(); + break; + } + auto move1_xy = game.board.get_xy(move1_compressed); + auto move2_xy = game.board.get_xy(move2_compressed); + auto xmin = std::min(move1_xy.first, move2_xy.first); + auto xmax = std::max(move1_xy.first, move2_xy.first); + auto ymin = std::min(move1_xy.second, move2_xy.second); + auto ymax = std::max(move1_xy.second, move2_xy.second); + for (auto move_x = xmin; move_x <= xmax; move_x++) { + for (auto move_y = ymin; move_y <= ymax; move_y++) { + moves.push_back( + game.board.get_vertex(move_x, move_y)); + } + } + } else { + auto move = game.board.text_to_move(textmove); + if (move == FastBoard::NO_VERTEX) { + moves.clear(); + break; + } + moves.push_back(move); + } + } + if (moves.empty()) { + return; + } + + int color; + if (textcolor == "w" || textcolor == "white") { + color = FastBoard::WHITE; + } else if (textcolor == "b" || textcolor == "black") { + color = FastBoard::BLACK; + } else { + return; + } + + if (until_movenum < 1) { + return; + } + until_movenum += game.get_movenum() - 1; + + for (const auto& move : moves) { + if (tag == "avoid") { + add_move_to_avoid(color, move, until_movenum); + if (move != FastBoard::PASS && move != FastBoard::RESIGN) { + if (color == 
FastBoard::BLACK) { + avoid_not_pass_resign_b = true; + } else { + avoid_not_pass_resign_w = true; + } + } + } else { + add_move_to_allow(color, move, until_movenum); + if (color == FastBoard::BLACK) { + allow_b = true; + } else { + allow_w = true; + } + } + } + if ((allow_b && avoid_not_pass_resign_b) || + (allow_w && avoid_not_pass_resign_w)) { + /* If "allow" is in use, it is illegal to use "avoid" with any + * move that is not "pass" or "resign". */ + return; + } + } else if (tag == "w" || tag == "white") { + m_who = FastBoard::WHITE; + } else if (tag == "b" || tag == "black") { + m_who = FastBoard::BLACK; + } else if (tag == "interval") { + cmdstream >> m_interval_centis; + if (cmdstream.fail()) { + return; + } + } else if (tag == "minmoves") { + cmdstream >> m_min_moves; + if (cmdstream.fail()) { + return; + } + } else { + return; + } + } +} + +void AnalyzeTags::add_move_to_avoid(const int color, const int vertex, + const size_t until_move) { + m_moves_to_avoid.emplace_back(color, until_move, vertex); +} + +void AnalyzeTags::add_move_to_allow(const int color, const int vertex, + const size_t until_move) { + m_moves_to_allow.emplace_back(color, until_move, vertex); +} + +int AnalyzeTags::interval_centis() const { + return m_interval_centis; +} + +int AnalyzeTags::invalid() const { + return m_invalid; +} + +int AnalyzeTags::who() const { + return m_who; +} + +size_t AnalyzeTags::post_move_count() const { + return m_min_moves; +} + +bool AnalyzeTags::is_to_avoid(const int color, const int vertex, + const size_t movenum) const { + for (auto& move : m_moves_to_avoid) { + if (color == move.color && vertex == move.vertex + && movenum <= move.until_move) { + return true; + } + } + if (vertex != FastBoard::PASS && vertex != FastBoard::RESIGN) { + auto active_allow = false; + for (auto& move : m_moves_to_allow) { + if (color == move.color && movenum <= move.until_move) { + active_allow = true; + if (vertex == move.vertex) { + return false; + } + } + } + if 
(active_allow) { + return true; + } + } + return false; +} + +bool AnalyzeTags::has_move_restrictions() const { + return !m_moves_to_avoid.empty() || !m_moves_to_allow.empty(); +} + +std::unique_ptr GTP::s_network; + +void GTP::initialize(std::unique_ptr&& net) { + s_network = std::move(net); + + bool result; + std::string message; + std::tie(result, message) = + set_max_memory(cfg_max_memory, cfg_max_cache_ratio_percent); + if (!result) { + // This should only ever happen with 60 block networks on a 32bit machine. + myprintf("LOW MEMORY SETTINGS! Couldn't set default memory limits.\n"); + myprintf("The network you are using might be too big\n"); + myprintf("for the default settings on your system.\n"); + throw std::runtime_error("Error setting memory requirements."); + } + myprintf("%s\n", message.c_str()); +} void GTP::setup_default_parameters() { cfg_gtp_mode = false; cfg_allow_pondering = true; - cfg_max_threads = std::max(1, std::min(SMP::get_num_cpus(), MAX_CPUS)); -#ifdef USE_OPENCL - // If we will be GPU limited, using many threads won't help much. - cfg_num_threads = std::min(2, cfg_max_threads); -#else - cfg_num_threads = cfg_max_threads; -#endif + + // we will re-calculate this in Leela.cpp + cfg_num_threads = 1; + // we will re-calculate this in Leela.cpp + cfg_batch_size = 1; + + cfg_max_memory = UCTSearch::DEFAULT_MAX_MEMORY; cfg_max_playouts = UCTSearch::UNLIMITED_PLAYOUTS; cfg_max_visits = UCTSearch::UNLIMITED_PLAYOUTS; + // This will be overwritten in initialize() after network size is known.
+ cfg_max_tree_size = UCTSearch::DEFAULT_MAX_MEMORY; + cfg_max_cache_ratio_percent = 10; cfg_timemanage = TimeManagement::AUTO; cfg_lagbuffer_cs = 100; + cfg_weightsfile = leelaz_file("best-network"); #ifdef USE_OPENCL - cfg_gpus = { }; + cfg_gpus = {}; cfg_sgemm_exhaustive = false; cfg_tune_only = false; + +#ifdef USE_HALF + cfg_precision = precision_t::AUTO; +#endif #endif - cfg_puct = 0.8f; + cfg_puct = 0.5f; + cfg_logpuct = 0.015f; + cfg_logconst = 1.7f; cfg_softmax_temp = 1.0f; cfg_fpu_reduction = 0.25f; // see UCTSearch::should_resign cfg_resignpct = -1; cfg_noise = false; + cfg_fpu_root_reduction = cfg_fpu_reduction; + cfg_ci_alpha = 1e-5f; + cfg_lcb_min_visit_ratio = 0.10f; cfg_random_cnt = 0; cfg_random_min_visits = 1; cfg_random_temp = 1.0f; @@ -107,6 +361,13 @@ void GTP::setup_default_parameters() { cfg_logfile_handle = nullptr; cfg_quiet = false; cfg_benchmark = false; +#ifdef USE_CPU_ONLY + cfg_cpu_only = true; +#else + cfg_cpu_only = false; +#endif + + cfg_analyze_tags = AnalyzeTags{}; // C++11 doesn't guarantee *anything* about how random this is, // and in MinGW it isn't random at all. But we can mix it in, which @@ -115,8 +376,8 @@ void GTP::setup_default_parameters() { std::ranlux48 gen(rd()); std::uint64_t seed1 = (gen() << 16) ^ gen(); // If the above fails, this is one of our best, portable, bets. 
- std::uint64_t seed2 = std::chrono::high_resolution_clock:: - now().time_since_epoch().count(); + std::uint64_t seed2 = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); cfg_rng_seed = seed1 ^ seed2; } @@ -139,6 +400,9 @@ const std::string GTP::s_commands[] = { "time_settings", "time_left", "fixed_handicap", + "last_move", + "move_history", + "clear_cache", "place_free_handicap", "set_free_handicap", "loadsgf", @@ -147,10 +411,28 @@ const std::string GTP::s_commands[] = { "kgs-time_settings", "kgs-game_over", "heatmap", + "lz-analyze", + "lz-genmove_analyze", + "lz-memory_report", + "lz-setoption", + "gomill-explain_last_move", + "" +}; + +// Default/min/max could be moved into separate fields, +// but for now we assume that the GUI will not send us invalid info. +const std::string GTP::s_options[] = { + "option name Maximum Memory Use (MiB) type spin default 2048 min 128 max 131072", + "option name Percentage of memory for cache type spin default 10 min 1 max 99", + "option name Visits type spin default 0 min 0 max 1000000000", + "option name Playouts type spin default 0 min 0 max 1000000000", + "option name Lagbuffer type spin default 0 min 0 max 3000", + "option name Resign Percentage type spin default -1 min -1 max 30", + "option name Pondering type check default true", "" }; -std::string GTP::get_life_list(const GameState & game, bool live) { +std::string GTP::get_life_list(const GameState& game, const bool live) { std::vector stringlist; std::string result; const auto& board = game.board; @@ -160,7 +442,7 @@ std::string GTP::get_life_list(const GameState & game, bool live) { for (int j = 0; j < board.get_boardsize(); j++) { int vertex = board.get_vertex(i, j); - if (board.get_square(vertex) != FastBoard::EMPTY) { + if (board.get_state(vertex) != FastBoard::EMPTY) { stringlist.push_back(board.get_string(vertex)); } } @@ -169,9 +451,9 @@ std::string GTP::get_life_list(const GameState & game, bool live) { // remove multiple mentions of the 
same string // unique reorders and returns new iterator, erase actually deletes - std::sort(stringlist.begin(), stringlist.end()); - stringlist.erase(std::unique(stringlist.begin(), stringlist.end()), - stringlist.end()); + std::sort(begin(stringlist), end(stringlist)); + stringlist.erase(std::unique(begin(stringlist), end(stringlist)), + end(stringlist)); for (size_t i = 0; i < stringlist.size(); i++) { result += (i == 0 ? "" : "\n") + stringlist[i]; @@ -180,9 +462,9 @@ std::string GTP::get_life_list(const GameState & game, bool live) { return result; } -bool GTP::execute(GameState & game, std::string xinput) { +void GTP::execute(GameState& game, const std::string& xinput) { std::string input; - static auto search = std::make_unique(game); + static auto search = std::make_unique(game, *s_network); bool transform_lowercase = true; @@ -196,9 +478,9 @@ bool GTP::execute(GameState & game, std::string xinput) { if (xinput[tmp] == 9) { input += " "; } else if ((xinput[tmp] > 0 && xinput[tmp] <= 9) - || (xinput[tmp] >= 11 && xinput[tmp] <= 31) - || xinput[tmp] == 127) { - continue; + || (xinput[tmp] >= 11 && xinput[tmp] <= 31) + || xinput[tmp] == 127) { + continue; } else { if (transform_lowercase) { input += std::tolower(xinput[tmp]); @@ -220,11 +502,11 @@ bool GTP::execute(GameState & game, std::string xinput) { int id = -1; if (input == "") { - return true; + return; } else if (input == "exit") { exit(EXIT_SUCCESS); } else if (input.find("#") == 0) { - return true; + return; } else if (std::isdigit(input[0])) { std::istringstream strm(input); char spacer; @@ -238,13 +520,13 @@ bool GTP::execute(GameState & game, std::string xinput) { /* process commands */ if (command == "protocol_version") { gtp_printf(id, "%d", GTP_VERSION); - return true; + return; } else if (command == "name") { gtp_printf(id, PROGRAM_NAME); - return true; + return; } else if (command == "version") { gtp_printf(id, PROGRAM_VERSION); - return true; + return; } else if (command == "quit") { 
gtp_printf(id, ""); exit(EXIT_SUCCESS); @@ -252,31 +534,31 @@ bool GTP::execute(GameState & game, std::string xinput) { std::istringstream cmdstream(command); std::string tmp; - cmdstream >> tmp; /* remove known_command */ + cmdstream >> tmp; /* remove known_command */ cmdstream >> tmp; for (int i = 0; s_commands[i].size() > 0; i++) { if (tmp == s_commands[i]) { gtp_printf(id, "true"); - return 1; + return; } } gtp_printf(id, "false"); - return true; + return; } else if (command.find("list_commands") == 0) { std::string outtmp(s_commands[0]); for (int i = 1; s_commands[i].size() > 0; i++) { outtmp = outtmp + "\n" + s_commands[i]; } gtp_printf(id, outtmp.c_str()); - return true; + return; } else if (command.find("boardsize") == 0) { std::istringstream cmdstream(command); std::string stmp; int tmp; - cmdstream >> stmp; // eat boardsize + cmdstream >> stmp; // eat boardsize cmdstream >> tmp; if (!cmdstream.fail()) { @@ -292,20 +574,21 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("clear_board") == 0) { Training::clear_training(); game.reset_game(); - search = std::make_unique(game); + search = std::make_unique(game, *s_network); + assert(UCTNodePointer::get_tree_size() == 0); gtp_printf(id, ""); - return true; + return; } else if (command.find("komi") == 0) { std::istringstream cmdstream(command); std::string tmp; - float komi = 7.5f; + float komi = KOMI; float old_komi = game.get_komi(); - cmdstream >> tmp; // eat komi + cmdstream >> tmp; // eat komi cmdstream >> komi; if (!cmdstream.fail()) { @@ -317,75 +600,126 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("play") == 0) { - if (command.find("resign") != std::string::npos) { - game.play_move(FastBoard::RESIGN); - gtp_printf(id, ""); - } else if (command.find("pass") != std::string::npos) { - 
game.play_move(FastBoard::PASS); - gtp_printf(id, ""); - } else { - std::istringstream cmdstream(command); - std::string tmp; - std::string color, vertex; + std::istringstream cmdstream(command); + std::string tmp; + std::string color, vertex; - cmdstream >> tmp; //eat play - cmdstream >> color; - cmdstream >> vertex; + cmdstream >> tmp; // eat play + cmdstream >> color; + cmdstream >> vertex; - if (!cmdstream.fail()) { - if (!game.play_textmove(color, vertex)) { - gtp_fail_printf(id, "illegal move"); - } else { - gtp_printf(id, ""); - } + if (!cmdstream.fail()) { + if (!game.play_textmove(color, vertex)) { + gtp_fail_printf(id, "illegal move"); } else { - gtp_fail_printf(id, "syntax not understood"); + gtp_printf(id, ""); } + } else { + gtp_fail_printf(id, "syntax not understood"); } - return true; - } else if (command.find("genmove") == 0) { + return; + } else if (command.find("genmove") == 0 + || command.find("lz-genmove_analyze") == 0) { + auto analysis_output = command.find("lz-genmove_analyze") == 0; + std::istringstream cmdstream(command); std::string tmp; + cmdstream >> tmp; // eat genmove - cmdstream >> tmp; // eat genmove - cmdstream >> tmp; + int who; + AnalyzeTags tags; - if (!cmdstream.fail()) { - int who; + if (analysis_output) { + tags = AnalyzeTags{cmdstream, game}; + if (tags.invalid()) { + gtp_fail_printf(id, "cannot parse analyze tags"); + return; + } + who = tags.who(); + } else { + /* genmove command */ + cmdstream >> tmp; if (tmp == "w" || tmp == "white") { who = FastBoard::WHITE; } else if (tmp == "b" || tmp == "black") { who = FastBoard::BLACK; } else { gtp_fail_printf(id, "syntax error"); - return 1; + return; } - // start thinking - { - game.set_to_move(who); - int move = search->think(who); - game.play_move(move); + } - std::string vertex = game.move_to_text(move); + if (analysis_output) { + // Start of multi-line response + cfg_analyze_tags = tags; + if (id != -1) { + gtp_printf_raw("=%d\n", id); + } else { + gtp_printf_raw("=\n"); + } + 
} + // start thinking + { + game.set_to_move(who); + // Outputs winrate and pvs for lz-genmove_analyze + int move = search->think(who); + game.play_move(move); + + std::string vertex = game.move_to_text(move); + if (!analysis_output) { gtp_printf(id, "%s", vertex.c_str()); + } else { + gtp_printf_raw("play %s\n", vertex.c_str()); } - if (cfg_allow_pondering) { - // now start pondering - if (!game.has_resigned()) { - search->ponder(); - } + } + + if (cfg_allow_pondering) { + // now start pondering + if (!game.has_resigned()) { + // Outputs winrate and pvs through gtp for lz-genmove_analyze + search->ponder(); } + } + if (analysis_output) { + // Terminate multi-line response + gtp_printf_raw("\n"); + } + cfg_analyze_tags = {}; + return; + } else if (command.find("lz-analyze") == 0) { + std::istringstream cmdstream(command); + std::string tmp; + + cmdstream >> tmp; // eat lz-analyze + AnalyzeTags tags{cmdstream, game}; + if (tags.invalid()) { + gtp_fail_printf(id, "cannot parse analyze tags"); + return; + } + // Start multi-line response. + if (id != -1) { + gtp_printf_raw("=%d\n", id); } else { - gtp_fail_printf(id, "syntax not understood"); + gtp_printf_raw("=\n"); } - return true; + // Now start pondering. 
+ if (!game.has_resigned()) { + cfg_analyze_tags = tags; + // Outputs winrate and pvs through gtp + game.set_to_move(tags.who()); + search->ponder(); + } + cfg_analyze_tags = {}; + // Terminate multi-line response + gtp_printf_raw("\n"); + return; } else if (command.find("kgs-genmove_cleanup") == 0) { std::istringstream cmdstream(command); std::string tmp; - cmdstream >> tmp; // eat kgs-genmove + cmdstream >> tmp; // eat kgs-genmove cmdstream >> tmp; if (!cmdstream.fail()) { @@ -396,7 +730,7 @@ bool GTP::execute(GameState & game, std::string xinput) { who = FastBoard::BLACK; } else { gtp_fail_printf(id, "syntax error"); - return 1; + return; } game.set_passes(0); { @@ -416,18 +750,18 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("undo") == 0) { if (game.undo_move()) { gtp_printf(id, ""); } else { gtp_fail_printf(id, "cannot undo"); } - return true; + return; } else if (command.find("showboard") == 0) { gtp_printf(id, ""); game.display_state(); - return true; + return; } else if (command.find("final_score") == 0) { float ftmp = game.final_score(); /* white wins */ @@ -438,7 +772,7 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_printf(id, "0"); } - return true; + return; } else if (command.find("final_status_list") == 0) { if (command.find("alive") != std::string::npos) { std::string livelist = get_life_list(game, true); @@ -449,7 +783,7 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_printf(id, ""); } - return true; + return; } else if (command.find("time_settings") == 0) { std::istringstream cmdstream(command); std::string tmp; @@ -465,7 +799,7 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("time_left") == 0) { std::istringstream cmdstream(command); std::string tmp, color; @@ 
-482,7 +816,7 @@ bool GTP::execute(GameState & game, std::string xinput) { icolor = FastBoard::BLACK; } else { gtp_fail_printf(id, "Color in time adjust not understood.\n"); - return 1; + return; } game.adjust_time(icolor, time * 100, stones); @@ -499,7 +833,7 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("auto") == 0) { do { int move = search->think(game.get_to_move(), UCTSearch::NORMAL); @@ -508,39 +842,39 @@ bool GTP::execute(GameState & game, std::string xinput) { } while (game.get_passes() < 2 && !game.has_resigned()); - return true; - } else if (command.find("go") == 0) { + return; + } else if (command.find("go") == 0 && command.size() < 6) { int move = search->think(game.get_to_move()); game.play_move(move); std::string vertex = game.move_to_text(move); myprintf("%s\n", vertex.c_str()); - return true; + return; } else if (command.find("heatmap") == 0) { std::istringstream cmdstream(command); std::string tmp; std::string symmetry; - cmdstream >> tmp; // eat heatmap + cmdstream >> tmp; // eat heatmap cmdstream >> symmetry; Network::Netresult vec; if (cmdstream.fail()) { - // Default = DIRECT with no rotation - vec = Network::get_scored_moves( - &game, Network::Ensemble::DIRECT, 0, true); + // Default = DIRECT with no symmetric change + vec = s_network->get_output(&game, Network::Ensemble::DIRECT, + Network::IDENTITY_SYMMETRY, false); } else if (symmetry == "all") { - for (auto r = 0; r < 8; r++) { - vec = Network::get_scored_moves( - &game, Network::Ensemble::DIRECT, r, true); + for (auto s = 0; s < Network::NUM_SYMMETRIES; ++s) { + vec = s_network->get_output(&game, Network::Ensemble::DIRECT, s, + false); Network::show_heatmap(&game, vec, false); } } else if (symmetry == "average" || symmetry == "avg") { - vec = Network::get_scored_moves( - &game, Network::Ensemble::AVERAGE, 8, true); + vec = s_network->get_output(&game, 
Network::Ensemble::AVERAGE, -1, + false); } else { - vec = Network::get_scored_moves( - &game, Network::Ensemble::DIRECT, std::stoi(symmetry), true); + vec = s_network->get_output(&game, Network::Ensemble::DIRECT, + std::stoi(symmetry), false); } if (symmetry != "all") { @@ -548,13 +882,13 @@ bool GTP::execute(GameState & game, std::string xinput) { } gtp_printf(id, ""); - return true; + return; } else if (command.find("fixed_handicap") == 0) { std::istringstream cmdstream(command); std::string tmp; int stones; - cmdstream >> tmp; // eat fixed_handicap + cmdstream >> tmp; // eat fixed_handicap cmdstream >> stones; if (!cmdstream.fail() && game.set_fixed_handicap(stones)) { @@ -563,29 +897,61 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_fail_printf(id, "Not a valid number of handicap stones"); } - return true; + return; + } else if (command.find("last_move") == 0) { + auto last_move = game.get_last_move(); + if (last_move == FastBoard::NO_VERTEX) { + gtp_fail_printf(id, "no previous move known"); + return; + } + auto coordinate = game.move_to_text(last_move); + auto color = game.get_to_move() == FastBoard::WHITE ? "black" : "white"; + gtp_printf(id, "%s %s", color, coordinate.c_str()); + return; + } else if (command.find("move_history") == 0) { + gtp_printf_raw("=%s %s", + id == -1 ? "" : std::to_string(id).c_str(), + game.get_movenum() == 0 ? "\n" : ""); + auto game_history = game.get_game_history(); + // undone moves may still be present, so reverse the portion of the + // array we need and resize to trim it down for iteration. + std::reverse(begin(game_history), + begin(game_history) + game.get_movenum() + 1); + game_history.resize(game.get_movenum()); + for (const auto& state : game_history) { + auto coordinate = game.move_to_text(state->get_last_move()); + auto color = + state->get_to_move() == FastBoard::WHITE ? 
"black" : "white"; + gtp_printf_raw("%s %s\n", color, coordinate.c_str()); + } + gtp_printf_raw("\n"); + return; + } else if (command.find("clear_cache") == 0) { + s_network->nncache_clear(); + gtp_printf(id, ""); + return; } else if (command.find("place_free_handicap") == 0) { std::istringstream cmdstream(command); std::string tmp; int stones; - cmdstream >> tmp; // eat place_free_handicap + cmdstream >> tmp; // eat place_free_handicap cmdstream >> stones; if (!cmdstream.fail()) { - game.place_free_handicap(stones); + game.place_free_handicap(stones, *s_network); auto stonestring = game.board.get_stone_list(); gtp_printf(id, "%s", stonestring.c_str()); } else { gtp_fail_printf(id, "Not a valid number of handicap stones"); } - return true; + return; } else if (command.find("set_free_handicap") == 0) { std::istringstream cmdstream(command); std::string tmp; - cmdstream >> tmp; // eat set_free_handicap + cmdstream >> tmp; // eat set_free_handicap do { std::string vertex; @@ -604,13 +970,13 @@ bool GTP::execute(GameState & game, std::string xinput) { std::string stonestring = game.board.get_stone_list(); gtp_printf(id, "%s", stonestring.c_str()); - return true; + return; } else if (command.find("loadsgf") == 0) { std::istringstream cmdstream(command); std::string tmp, filename; int movenum; - cmdstream >> tmp; // eat loadsgf + cmdstream >> tmp; // eat loadsgf cmdstream >> filename; if (!cmdstream.fail()) { @@ -621,7 +987,7 @@ bool GTP::execute(GameState & game, std::string xinput) { } } else { gtp_fail_printf(id, "Missing filename."); - return true; + return; } auto sgftree = std::make_unique(); @@ -633,7 +999,7 @@ bool GTP::execute(GameState & game, std::string xinput) { } catch (const std::exception&) { gtp_fail_printf(id, "cannot load file"); } - return true; + return; } else if (command.find("kgs-chat") == 0) { // kgs-chat (game|private) Name Message std::istringstream cmdstream(command); @@ -647,11 +1013,11 @@ bool GTP::execute(GameState & game, std::string 
xinput) { } while (!cmdstream.fail()); gtp_fail_printf(id, "I'm a go bot, not a chat bot."); - return true; + return; } else if (command.find("kgs-game_over") == 0) { // Do nothing. Particularly, don't ponder. gtp_printf(id, ""); - return true; + return; } else if (command.find("kgs-time_settings") == 0) { // none, absolute, byoyomi, or canadian std::istringstream cmdstream(command); @@ -677,7 +1043,7 @@ bool GTP::execute(GameState & game, std::string xinput) { game.set_timecontrol(maintime * 100, byotime * 100, 0, byoperiods); } else { gtp_fail_printf(id, "syntax not understood"); - return true; + return; } if (!cmdstream.fail()) { @@ -685,31 +1051,34 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("netbench") == 0) { std::istringstream cmdstream(command); std::string tmp; int iterations; - cmdstream >> tmp; // eat netbench + cmdstream >> tmp; // eat netbench cmdstream >> iterations; if (!cmdstream.fail()) { - Network::benchmark(&game, iterations); + s_network->benchmark(&game, iterations); } else { - Network::benchmark(&game); + s_network->benchmark(&game); } gtp_printf(id, ""); - return true; + return; } else if (command.find("printsgf") == 0) { std::istringstream cmdstream(command); std::string tmp, filename; - cmdstream >> tmp; // eat printsgf + cmdstream >> tmp; // eat printsgf cmdstream >> filename; auto sgf_text = SGFTree::state_to_string(game, 0); + // GTP says consecutive newlines terminate the output, + // so we must filter those. 
+ boost::replace_all(sgf_text, "\n\n", "\n"); if (cmdstream.fail()) { gtp_printf(id, "%s\n", sgf_text.c_str()); @@ -720,7 +1089,7 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_printf(id, ""); } - return true; + return; } else if (command.find("load_training") == 0) { std::istringstream cmdstream(command); std::string tmp, filename; @@ -736,13 +1105,13 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("save_training") == 0) { std::istringstream cmdstream(command); std::string tmp, filename; // tmp will eat "save_training" - cmdstream >> tmp >> filename; + cmdstream >> tmp >> filename; Training::save_training(filename); @@ -752,7 +1121,7 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("dump_training") == 0) { std::istringstream cmdstream(command); std::string tmp, winner_color, filename; @@ -767,7 +1136,7 @@ bool GTP::execute(GameState & game, std::string xinput) { who_won = FullBoard::BLACK; } else { gtp_fail_printf(id, "syntax not understood"); - return true; + return; } Training::dump_training(who_won, filename); @@ -778,7 +1147,7 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("dump_debug") == 0) { std::istringstream cmdstream(command); std::string tmp, filename; @@ -794,7 +1163,7 @@ bool GTP::execute(GameState & game, std::string xinput) { gtp_fail_printf(id, "syntax not understood"); } - return true; + return; } else if (command.find("dump_supervised") == 0) { std::istringstream cmdstream(command); std::string tmp, sgfname, outname; @@ -809,10 +1178,240 @@ bool GTP::execute(GameState & game, std::string xinput) { } else { gtp_fail_printf(id, "syntax not understood"); } + return; + } else if 
(command.find("lz-memory_report") == 0) { + auto base_memory = get_base_memory(); + auto tree_size = add_overhead(UCTNodePointer::get_tree_size()); + auto cache_size = add_overhead(s_network->get_estimated_cache_size()); + + auto total = base_memory + tree_size + cache_size; + gtp_printf(id, + "Estimated total memory consumption: %d MiB.\n" + "Network with overhead: %d MiB / Search tree: %d MiB / Network cache: %d\n", + total / MiB, base_memory / MiB, tree_size / MiB, + cache_size / MiB); + return; + } else if (command.find("lz-setoption") == 0) { + return execute_setoption(*search.get(), id, command); + } else if (command.find("gomill-explain_last_move") == 0) { + gtp_printf(id, "%s\n", search->explain_last_think().c_str()); + return; + } + gtp_fail_printf(id, "unknown command"); + return; +} + +std::pair GTP::parse_option(std::istringstream& is) { + std::string token, name, value; + + // Read option name (can contain spaces) + while (is >> token && token != "value") + name += std::string(" ", name.empty() ? 0 : 1) + token; + + // Read option value (can contain spaces) + while (is >> token) + value += std::string(" ", value.empty() ? 0 : 1) + token; + + return std::make_pair(name, value); +} + +size_t GTP::get_base_memory() { + // At the moment of writing the memory consumption is + // roughly network size + 85 for one GPU and + 160 for two GPUs. +#ifdef USE_OPENCL + auto gpus = std::max(cfg_gpus.size(), size_t{1}); + return s_network->get_estimated_size() + 85 * MiB * gpus; +#else + return s_network->get_estimated_size(); +#endif +} + +std::pair GTP::set_max_memory( + size_t max_memory, const int cache_size_ratio_percent) { + if (max_memory == 0) { + max_memory = UCTSearch::DEFAULT_MAX_MEMORY; + } + + // Calculate amount of memory available for the search tree + + // NNCache by estimating a constant memory overhead first. + auto base_memory = get_base_memory(); + + if (max_memory < base_memory) { + return std::make_pair(false, "Not enough memory for network. 
" + + std::to_string(base_memory / MiB) + + " MiB required."); + } + + auto max_memory_for_search = max_memory - base_memory; + + assert(cache_size_ratio_percent >= 1); + assert(cache_size_ratio_percent <= 99); + auto max_cache_size = + max_memory_for_search * cache_size_ratio_percent / 100; + + auto max_cache_count = + (int)(remove_overhead(max_cache_size) / NNCache::ENTRY_SIZE); + + // Verify if the setting would not result in too little cache. + if (max_cache_count < NNCache::MIN_CACHE_COUNT) { + return std::make_pair(false, "Not enough memory for cache."); + } + auto max_tree_size = max_memory_for_search - max_cache_size; + + if (max_tree_size < UCTSearch::MIN_TREE_SPACE) { + return std::make_pair(false, "Not enough memory for search tree."); + } + + // Only if settings are ok we store the values in config. + cfg_max_memory = max_memory; + cfg_max_cache_ratio_percent = cache_size_ratio_percent; + // Set max_tree_size. + cfg_max_tree_size = remove_overhead(max_tree_size); + // Resize cache. + s_network->nncache_resize(max_cache_count); + + return std::make_pair( + true, "Setting max tree size to " + std::to_string(max_tree_size / MiB) + + " MiB and cache size to " + + std::to_string(max_cache_size / MiB) + " MiB."); +} + +void GTP::execute_setoption(UCTSearch& search, const int id, + const std::string& command) { + std::istringstream cmdstream(command); + std::string tmp, name_token; + + // Consume lz_setoption, name. + cmdstream >> tmp >> name_token; + + // Print available options if called without an argument. 
+ if (cmdstream.fail()) { + std::string options_out_tmp(""); + for (int i = 0; s_options[i].size() > 0; i++) { + options_out_tmp = options_out_tmp + "\n" + s_options[i]; + } + gtp_printf(id, options_out_tmp.c_str()); + return; + } - return true; + if (name_token.find("name") != 0) { + gtp_fail_printf(id, "incorrect syntax for lz-setoption"); + return; } - gtp_fail_printf(id, "unknown command"); - return true; + std::string name, value; + std::tie(name, value) = parse_option(cmdstream); + + if (name == "maximum memory use (mib)") { + std::istringstream valuestream(value); + int max_memory_in_mib; + valuestream >> max_memory_in_mib; + if (!valuestream.fail()) { + if (max_memory_in_mib < 128 || max_memory_in_mib > 131072) { + gtp_fail_printf(id, "incorrect value"); + return; + } + bool result; + std::string reason; + std::tie(result, reason) = set_max_memory( + max_memory_in_mib * MiB, cfg_max_cache_ratio_percent); + if (result) { + gtp_printf(id, reason.c_str()); + } else { + gtp_fail_printf(id, reason.c_str()); + } + return; + } else { + gtp_fail_printf(id, "incorrect value"); + return; + } + } else if (name == "percentage of memory for cache") { + std::istringstream valuestream(value); + int cache_size_ratio_percent; + valuestream >> cache_size_ratio_percent; + if (cache_size_ratio_percent < 1 || cache_size_ratio_percent > 99) { + gtp_fail_printf(id, "incorrect value"); + return; + } + bool result; + std::string reason; + std::tie(result, reason) = + set_max_memory(cfg_max_memory, cache_size_ratio_percent); + if (result) { + gtp_printf(id, reason.c_str()); + } else { + gtp_fail_printf(id, reason.c_str()); + } + return; + } else if (name == "visits") { + std::istringstream valuestream(value); + int visits; + valuestream >> visits; + cfg_max_visits = visits; + + // 0 may be specified to mean "no limit" + if (cfg_max_visits == 0) { + cfg_max_visits = UCTSearch::UNLIMITED_PLAYOUTS; + } + // Note that if the visits are changed but no + // explicit command to set memory 
usage is given, + // we will stick with the initial guess we made on startup. + search.set_visit_limit(cfg_max_visits); + + gtp_printf(id, ""); + } else if (name == "playouts") { + std::istringstream valuestream(value); + int playouts; + valuestream >> playouts; + cfg_max_playouts = playouts; + + // 0 may be specified to mean "no limit" + if (cfg_max_playouts == 0) { + cfg_max_playouts = UCTSearch::UNLIMITED_PLAYOUTS; + } else if (cfg_allow_pondering) { + // Limiting playouts while pondering is still enabled + // makes no sense. + gtp_fail_printf(id, "incorrect value"); + return; + } + + // Note that if the playouts are changed but no + // explicit command to set memory usage is given, + // we will stick with the initial guess we made on startup. + search.set_playout_limit(cfg_max_playouts); + + gtp_printf(id, ""); + } else if (name == "lagbuffer") { + std::istringstream valuestream(value); + int lagbuffer; + valuestream >> lagbuffer; + cfg_lagbuffer_cs = lagbuffer; + gtp_printf(id, ""); + } else if (name == "pondering") { + std::istringstream valuestream(value); + std::string toggle; + valuestream >> toggle; + if (toggle == "true") { + if (cfg_max_playouts != UCTSearch::UNLIMITED_PLAYOUTS) { + gtp_fail_printf(id, "incorrect value"); + return; + } + cfg_allow_pondering = true; + } else if (toggle == "false") { + cfg_allow_pondering = false; + } else { + gtp_fail_printf(id, "incorrect value"); + return; + } + gtp_printf(id, ""); + } else if (name == "resign percentage") { + std::istringstream valuestream(value); + int resignpct; + valuestream >> resignpct; + cfg_resignpct = resignpct; + gtp_printf(id, ""); + } else { + gtp_fail_printf(id, "Unknown option"); + } + return; } diff --git a/src/GTP.h b/src/GTP.h index 92985fd6c..19f8681cd 100644 --- a/src/GTP.h +++ b/src/GTP.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef GTP_H_INCLUDED @@ -26,14 +37,56 @@ #include #include "GameState.h" +#include "Network.h" #include "UCTSearch.h" +struct MoveToAvoid { + int color; + size_t until_move; + int vertex; + + MoveToAvoid(const int color, const size_t until_move, const int vertex) + : color(color), until_move(until_move), vertex(vertex) {} + + bool operator==(const MoveToAvoid& other) const { + return color == other.color && until_move == other.until_move + && vertex == other.vertex; + } +}; + +class AnalyzeTags { + friend class LeelaTest; + +public: + AnalyzeTags() = default; + AnalyzeTags(std::istringstream& cmdstream, const GameState& game); + + void add_move_to_avoid(int color, int vertex, size_t until_move); + void add_move_to_allow(int color, int vertex, size_t until_move); + int interval_centis() const; + int invalid() const; + int who() const; + size_t post_move_count() const; + bool is_to_avoid(int color, int vertex, size_t movenum) const; + bool has_move_restrictions() const; + +private: + bool m_invalid{true}; + std::vector m_moves_to_avoid, m_moves_to_allow; + int 
m_interval_centis{0}; + int m_who{FastBoard::INVAL}; + size_t m_min_moves{0}; +}; + extern bool cfg_gtp_mode; extern bool cfg_allow_pondering; -extern int cfg_num_threads; -extern int cfg_max_threads; +extern unsigned int cfg_num_threads; +extern unsigned int cfg_batch_size; extern int cfg_max_playouts; extern int cfg_max_visits; +extern size_t cfg_max_memory; +extern size_t cfg_max_tree_size; +extern int cfg_max_cache_ratio_percent; extern TimeManagement::enabled_t cfg_timemanage; extern int cfg_lagbuffer_cs; extern int cfg_resignpct; @@ -47,16 +100,31 @@ extern bool cfg_dumbpass; extern std::vector cfg_gpus; extern bool cfg_sgemm_exhaustive; extern bool cfg_tune_only; +#ifdef USE_HALF +enum class precision_t { + AUTO, SINGLE, HALF +}; +extern precision_t cfg_precision; +#endif #endif extern float cfg_puct; +extern float cfg_logpuct; +extern float cfg_logconst; extern float cfg_softmax_temp; extern float cfg_fpu_reduction; +extern float cfg_fpu_root_reduction; +extern float cfg_ci_alpha; +extern float cfg_lcb_min_visit_ratio; extern std::string cfg_logfile; extern std::string cfg_weightsfile; extern FILE* cfg_logfile_handle; extern bool cfg_quiet; extern std::string cfg_options_str; extern bool cfg_benchmark; +extern bool cfg_cpu_only; +extern AnalyzeTags cfg_analyze_tags; + +static constexpr size_t MiB = 1024LL * 1024LL; /* A list of all valid GTP2 commands is defined here: @@ -65,14 +133,32 @@ extern bool cfg_benchmark; */ class GTP { public: - static bool execute(GameState & game, std::string xinput); + static std::unique_ptr s_network; + static void initialize(std::unique_ptr&& network); + static void execute(GameState& game, const std::string& xinput); static void setup_default_parameters(); + private: static constexpr int GTP_VERSION = 2; - static std::string get_life_list(const GameState & game, bool live); + static std::string get_life_list(const GameState& game, bool live); static const std::string s_commands[]; -}; + static const std::string s_options[]; 
+ static std::pair parse_option( + std::istringstream& is); + static std::pair set_max_memory( + size_t max_memory, int cache_size_ratio_percent); + static void execute_setoption(UCTSearch& search, int id, + const std::string& command); + // Memory estimation helpers + static size_t get_base_memory(); + static size_t add_overhead(const size_t s) { + return s * 11LL / 10LL; + } + static size_t remove_overhead(const size_t s) { + return s * 10LL / 11LL; + } +}; #endif diff --git a/src/GameState.cpp b/src/GameState.cpp index 333ca785c..f523b0f4a 100644 --- a/src/GameState.cpp +++ b/src/GameState.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,32 +14,41 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . -*/ -#include "GameState.h" + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ #include #include #include -#include #include #include -#include #include +#include "GameState.h" + #include "FastBoard.h" #include "FastState.h" #include "FullBoard.h" #include "KoState.h" +#include "Network.h" #include "UCTSearch.h" -void GameState::init_game(int size, float komi) { +void GameState::init_game(const int size, const float komi) { KoState::init_game(size, komi); - game_history.clear(); - game_history.emplace_back(std::make_shared(*this)); + m_game_history.clear(); + m_game_history.emplace_back(std::make_shared(*this)); - m_timecontrol.set_boardsize(board.get_boardsize()); m_timecontrol.reset_clocks(); m_resigned = FastBoard::EMPTY; @@ -48,30 +57,30 @@ void GameState::init_game(int size, float komi) { void GameState::reset_game() { KoState::reset_game(); - game_history.clear(); - game_history.emplace_back(std::make_shared(*this)); + m_game_history.clear(); + m_game_history.emplace_back(std::make_shared(*this)); m_timecontrol.reset_clocks(); m_resigned = FastBoard::EMPTY; } -bool GameState::forward_move(void) { - if (game_history.size() > m_movenum + 1) { +bool GameState::forward_move() { + if (m_game_history.size() > m_movenum + 1) { m_movenum++; - *(static_cast(this)) = *game_history[m_movenum]; + *(static_cast(this)) = *m_game_history[m_movenum]; return true; } else { return false; } } -bool GameState::undo_move(void) { +bool GameState::undo_move() { if (m_movenum > 0) { m_movenum--; // this is not so nice, but it should work - *(static_cast(this)) = *game_history[m_movenum]; + *(static_cast(this)) = *m_game_history[m_movenum]; // This also restores hashes as they're part of state return true; @@ -80,16 +89,16 @@ bool GameState::undo_move(void) { } } -void GameState::rewind(void) { - *(static_cast(this)) = *game_history[0]; +void GameState::rewind() { + *(static_cast(this)) = *m_game_history[0]; m_movenum = 0; } -void GameState::play_move(int vertex) { +void GameState::play_move(const int vertex) { play_move(get_to_move(), vertex); } -void 
GameState::play_move(int color, int vertex) { +void GameState::play_move(const int color, const int vertex) { if (vertex == FastBoard::RESIGN) { m_resigned = color; } else { @@ -97,13 +106,13 @@ void GameState::play_move(int color, int vertex) { } // cut off any leftover moves from navigating - game_history.resize(m_movenum); - game_history.emplace_back(std::make_shared(*this)); + m_game_history.resize(m_movenum); + m_game_history.emplace_back(std::make_shared(*this)); } -bool GameState::play_textmove(const std::string& color, - const std::string& vertex) { +bool GameState::play_textmove(std::string color, const std::string& vertex) { int who; + transform(cbegin(color), cend(color), begin(color), tolower); if (color == "w" || color == "white") { who = FullBoard::WHITE; } else if (color == "b" || color == "black") { @@ -112,40 +121,10 @@ bool GameState::play_textmove(const std::string& color, return false; } - if (vertex.size() < 2) return false; - if (!std::isalpha(vertex[0])) return false; - if (!std::isdigit(vertex[1])) return false; - if (vertex[0] == 'i') return false; - - int column, row; - if (vertex[0] >= 'A' && vertex[0] <= 'Z') { - if (vertex[0] < 'I') { - column = 25 + vertex[0] - 'A'; - } else { - column = 25 + (vertex[0] - 'A')-1; - } - } else { - if (vertex[0] < 'i') { - column = vertex[0] - 'a'; - } else { - column = (vertex[0] - 'a')-1; - } - } - - std::string rowstring(vertex); - rowstring.erase(0, 1); - std::istringstream parsestream(rowstring); - - parsestream >> row; - row--; - - auto boardsize = board.get_boardsize(); - if (row >= boardsize || column >= boardsize) { - return false; - } - - auto move = board.get_vertex(column, row); - if (board.get_square(move) != FastBoard::EMPTY) { + const auto move = board.text_to_move(vertex); + if (move == FastBoard::NO_VERTEX + || (move != FastBoard::PASS && move != FastBoard::RESIGN + && board.get_state(move) != FastBoard::EMPTY)) { return false; } @@ -155,11 +134,11 @@ bool GameState::play_textmove(const 
std::string& color, return true; } -void GameState::stop_clock(int color) { +void GameState::stop_clock(const int color) { m_timecontrol.stop(color); } -void GameState::start_clock(int color) { +void GameState::start_clock(const int color) { m_timecontrol.start(color); } @@ -177,34 +156,33 @@ bool GameState::has_resigned() const { return m_resigned != FastBoard::EMPTY; } -TimeControl& GameState::get_timecontrol() { +const TimeControl& GameState::get_timecontrol() const { return m_timecontrol; } -void GameState::set_timecontrol(int maintime, int byotime, - int byostones, int byoperiods) { - TimeControl timecontrol(board.get_boardsize(), maintime, byotime, - byostones, byoperiods); - +void GameState::set_timecontrol(const TimeControl& timecontrol) { m_timecontrol = timecontrol; } -void GameState::set_timecontrol(TimeControl tmc) { - m_timecontrol = tmc; +void GameState::set_timecontrol(const int maintime, const int byotime, + const int byostones, const int byoperiods) { + TimeControl timecontrol(maintime, byotime, byostones, byoperiods); + + m_timecontrol = timecontrol; } -void GameState::adjust_time(int color, int time, int stones) { +void GameState::adjust_time(const int color, const int time, const int stones) { m_timecontrol.adjust_time(color, time, stones); } -void GameState::anchor_game_history(void) { +void GameState::anchor_game_history() { // handicap moves don't count in game history m_movenum = 0; - game_history.clear(); - game_history.emplace_back(std::make_shared(*this)); + m_game_history.clear(); + m_game_history.emplace_back(std::make_shared(*this)); } -bool GameState::set_fixed_handicap(int handicap) { +bool GameState::set_fixed_handicap(const int handicap) { if (!valid_handicap(handicap)) { return false; } @@ -250,7 +228,7 @@ bool GameState::set_fixed_handicap(int handicap) { return true; } -int GameState::set_fixed_handicap_2(int handicap) { +int GameState::set_fixed_handicap_2(const int handicap) { int board_size = board.get_boardsize(); int low = 
board_size >= 13 ? 3 : 2; int mid = board_size / 2; @@ -263,15 +241,15 @@ int GameState::set_fixed_handicap_2(int handicap) { for (int i = low; i <= high; i += interval) { for (int j = low; j <= high; j += interval) { if (placed >= handicap) return placed; - if (board.get_square(i-1, j-1) != FastBoard::EMPTY) continue; - if (board.get_square(i-1, j) != FastBoard::EMPTY) continue; - if (board.get_square(i-1, j+1) != FastBoard::EMPTY) continue; - if (board.get_square(i, j-1) != FastBoard::EMPTY) continue; - if (board.get_square(i, j) != FastBoard::EMPTY) continue; - if (board.get_square(i, j+1) != FastBoard::EMPTY) continue; - if (board.get_square(i+1, j-1) != FastBoard::EMPTY) continue; - if (board.get_square(i+1, j) != FastBoard::EMPTY) continue; - if (board.get_square(i+1, j+1) != FastBoard::EMPTY) continue; + if (board.get_state(i - 1, j - 1) != FastBoard::EMPTY) continue; + if (board.get_state(i - 1, j) != FastBoard::EMPTY) continue; + if (board.get_state(i - 1, j + 1) != FastBoard::EMPTY) continue; + if (board.get_state(i, j - 1) != FastBoard::EMPTY) continue; + if (board.get_state(i, j) != FastBoard::EMPTY) continue; + if (board.get_state(i, j + 1) != FastBoard::EMPTY) continue; + if (board.get_state(i + 1, j - 1) != FastBoard::EMPTY) continue; + if (board.get_state(i + 1, j) != FastBoard::EMPTY) continue; + if (board.get_state(i + 1, j + 1) != FastBoard::EMPTY) continue; play_move(FastBoard::BLACK, board.get_vertex(i, j)); placed++; } @@ -282,7 +260,7 @@ int GameState::set_fixed_handicap_2(int handicap) { return placed; } -bool GameState::valid_handicap(int handicap) { +bool GameState::valid_handicap(const int handicap) { int board_size = board.get_boardsize(); if (handicap < 2 || handicap > 9) { @@ -301,7 +279,7 @@ bool GameState::valid_handicap(int handicap) { return true; } -void GameState::place_free_handicap(int stones) { +void GameState::place_free_handicap(int stones, Network& network) { int limit = board.get_boardsize() * board.get_boardsize(); if 
(stones > limit / 2) { stones = limit / 2; @@ -317,12 +295,12 @@ void GameState::place_free_handicap(int stones) { stones -= set_fixed_handicap_2(stones); for (int i = 0; i < stones; i++) { - auto search = std::make_unique(*this); + auto search = std::make_unique(*this, network); auto move = search->think(FastBoard::BLACK, UCTSearch::NOPASS); play_move(FastBoard::BLACK, move); } - if (orgstones) { + if (orgstones) { board.set_to_move(FastBoard::WHITE); } else { board.set_to_move(FastBoard::BLACK); @@ -333,8 +311,13 @@ void GameState::place_free_handicap(int stones) { set_handicap(orgstones); } -const FullBoard& GameState::get_past_board(int moves_ago) const { +const FullBoard& GameState::get_past_board(const int moves_ago) const { assert(moves_ago >= 0 && (unsigned)moves_ago <= m_movenum); - assert(m_movenum + 1 <= game_history.size()); - return game_history[m_movenum - moves_ago]->board; + assert(m_movenum + 1 <= m_game_history.size()); + return m_game_history[m_movenum - moves_ago]->board; +} + +const std::vector>& +GameState::get_game_history() const { + return m_game_history; } diff --git a/src/GameState.h b/src/GameState.h index 8589a9c4b..9f6b56b37 100644 --- a/src/GameState.h +++ b/src/GameState.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef GAMESTATE_H_INCLUDED @@ -28,6 +39,8 @@ #include "KoState.h" #include "TimeControl.h" +class Network; + class GameState : public KoState { public: explicit GameState() = default; @@ -40,25 +53,25 @@ class GameState : public KoState { void reset_game(); bool set_fixed_handicap(int stones); int set_fixed_handicap_2(int stones); - void place_free_handicap(int stones); - void anchor_game_history(void); + void place_free_handicap(int stones, Network& network); + void anchor_game_history(); - void rewind(void); /* undo infinite */ - bool undo_move(void); - bool forward_move(void); + void rewind(); /* undo infinite */ + bool undo_move(); + bool forward_move(); const FullBoard& get_past_board(int moves_ago) const; + const std::vector>& get_game_history() const; void play_move(int color, int vertex); void play_move(int vertex); - bool play_textmove(const std::string& color, - const std::string& vertex); + bool play_textmove(std::string color, const std::string& vertex); void start_clock(int color); void stop_clock(int color); - TimeControl& get_timecontrol(); + const TimeControl& get_timecontrol() const; + void set_timecontrol(const TimeControl& timecontrol); void set_timecontrol(int maintime, int byotime, int byostones, int byoperiods); - void set_timecontrol(TimeControl tmc); void adjust_time(int color, int time, int stones); void display_state(); @@ -68,7 +81,7 @@ class GameState : public KoState { private: bool valid_handicap(int stones); - 
std::vector> game_history; + std::vector> m_game_history; TimeControl m_timecontrol; int m_resigned{FastBoard::EMPTY}; }; diff --git a/src/Im2Col.h b/src/Im2Col.h index 50a92902a..702fee807 100644 --- a/src/Im2Col.h +++ b/src/Im2Col.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,37 +14,50 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #ifndef IM2COL_H_INCLUDED #define IM2COL_H_INCLUDED +#include #include #include -#include template -void im2col(const int channels, - const std::vector& input, +void im2col(const int channels, const std::vector& input, std::vector& output) { constexpr unsigned int height = BOARD_SIZE; constexpr unsigned int width = BOARD_SIZE; constexpr int pad = (filter_size / 2); - constexpr unsigned int output_h = height + 2 * pad - filter_size + 1; + constexpr unsigned int output_h = height + 2 * pad - filter_size + 1; constexpr unsigned int output_w = width + 2 * pad - filter_size + 1; const float* data_im = input.data(); float* data_col = output.data(); - for (int channel = channels; channel--; data_im += BOARD_SQUARES) { - for (unsigned int kernel_row = 0; kernel_row < filter_size; kernel_row++) { - for (unsigned int kernel_col = 0; kernel_col < filter_size; kernel_col++) { + for (int channel = channels; channel--; data_im += NUM_INTERSECTIONS) { + for (unsigned int kernel_row = 0; kernel_row < filter_size; + kernel_row++) { + for (unsigned int kernel_col = 0; kernel_col < filter_size; + kernel_col++) { int input_row = -pad + kernel_row; for (int output_rows = output_h; output_rows; output_rows--) { if (unsigned(input_row) < height) { int input_col = -pad + kernel_col; - for (int output_col = output_w; output_col; output_col--) { + for (int output_col = output_w; output_col; + output_col--) { if (unsigned(input_col) < width) { *(data_col++) = data_im[input_row * width + input_col]; @@ -54,7 +67,8 @@ void im2col(const int channels, input_col++; } } else { - for (int output_cols = output_w; output_cols; output_cols--) { + for (int output_cols = output_w; output_cols; + output_cols--) { *(data_col++) = 0; } } @@ -66,10 +80,9 @@ void im2col(const int channels, } template <> -void im2col<1>(const int channels, - const std::vector& input, +void im2col<1>(const int channels, const std::vector& input, std::vector& output) { - auto outSize = size_t{channels * 
static_cast(BOARD_SQUARES)}; + auto outSize = size_t{channels * static_cast(NUM_INTERSECTIONS)}; assert(output.size() == outSize); std::copy(begin(input), begin(input) + outSize, begin(output)); } diff --git a/src/KoState.cpp b/src/KoState.cpp index 3b72540b8..62ef342a4 100644 --- a/src/KoState.cpp +++ b/src/KoState.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,20 +14,32 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" -#include "KoState.h" -#include #include +#include #include +#include "KoState.h" + #include "FastBoard.h" #include "FastState.h" #include "FullBoard.h" -void KoState::init_game(int size, float komi) { +void KoState::init_game(const int size, const float komi) { assert(size <= BOARD_SIZE); FastState::init_game(size, komi); @@ -36,7 +48,7 @@ void KoState::init_game(int size, float komi) { m_ko_hash_history.emplace_back(board.get_ko_hash()); } -bool KoState::superko(void) const { +bool KoState::superko() const { auto first = crbegin(m_ko_hash_history); auto last = crend(m_ko_hash_history); @@ -52,11 +64,11 @@ void KoState::reset_game() { m_ko_hash_history.push_back(board.get_ko_hash()); } -void KoState::play_move(int vertex) { +void KoState::play_move(const int vertex) { play_move(board.get_to_move(), vertex); } -void KoState::play_move(int color, int vertex) { +void KoState::play_move(const int color, const int vertex) { if (vertex != FastBoard::RESIGN) { FastState::play_move(color, vertex); } diff --git a/src/KoState.h b/src/KoState.h index 03bde6797..1f6805af1 100644 --- a/src/KoState.h +++ b/src/KoState.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef KOSTATE_H_INCLUDED @@ -29,7 +40,7 @@ class KoState : public FastState { public: void init_game(int size, float komi); - bool superko(void) const; + bool superko() const; void reset_game(); void play_move(int color, int vertex); diff --git a/src/Leela.cpp b/src/Leela.cpp index 9a17b160e..2bf148bc3 100644 --- a/src/Leela.cpp +++ b/src/Leela.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,14 +14,26 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" -#include #include +#include #include #include +#include #include #include #include @@ -31,8 +43,8 @@ #include "GTP.h" #include "GameState.h" -#include "Network.h" #include "NNCache.h" +#include "Network.h" #include "Random.h" #include "ThreadPool.h" #include "Utils.h" @@ -42,22 +54,94 @@ using namespace Utils; static void license_blurb() { printf( - "Leela Zero %s Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors\n" + "Leela Zero %s Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors\n" "This program comes with ABSOLUTELY NO WARRANTY.\n" "This is free software, and you are welcome to redistribute it\n" "under certain conditions; see the COPYING file for details.\n\n", PROGRAM_VERSION); } -static void parse_commandline(int argc, char *argv[]) { +static void calculate_thread_count_cpu( + boost::program_options::variables_map& vm) { + // If we are CPU-based, there is no point using more than the number of CPUs. + auto cfg_max_threads = std::min(SMP::get_num_cpus(), size_t{MAX_CPUS}); + + if (vm["threads"].as() > 0) { + auto num_threads = vm["threads"].as(); + if (num_threads > cfg_max_threads) { + myprintf("Clamping threads to maximum = %d\n", cfg_max_threads); + num_threads = cfg_max_threads; + } + cfg_num_threads = num_threads; + } else { + cfg_num_threads = cfg_max_threads; + } +} + +#ifdef USE_OPENCL +static void calculate_thread_count_gpu( + boost::program_options::variables_map& vm) { + auto cfg_max_threads = size_t{MAX_CPUS}; + + // Default thread count : GPU case + // 1) if no args are given, use batch size of 5 and thread count of (batch size) * (number of gpus) * 2 + // 2) if number of threads are given, use batch size of (thread count) / (number of gpus) / 2 + // 3) if number of batches are given, use thread count of (batch size) * (number of gpus) * 2 + auto gpu_count = cfg_gpus.size(); + if (gpu_count == 0) { + // size of zero if autodetect GPU : default to 1 + gpu_count = 1; + } + + if (vm["threads"].as() > 0) 
{ + auto num_threads = vm["threads"].as(); + if (num_threads > cfg_max_threads) { + myprintf("Clamping threads to maximum = %d\n", cfg_max_threads); + num_threads = cfg_max_threads; + } + cfg_num_threads = num_threads; + + if (vm["batchsize"].as() > 0) { + cfg_batch_size = vm["batchsize"].as(); + } else { + cfg_batch_size = + (cfg_num_threads + (gpu_count * 2) - 1) / (gpu_count * 2); + + // no idea why somebody wants to use threads less than the number of GPUs + // but should at least prevent crashing + if (cfg_batch_size == 0) { + cfg_batch_size = 1; + } + } + } else { + if (vm["batchsize"].as() > 0) { + cfg_batch_size = vm["batchsize"].as(); + } else { + cfg_batch_size = 5; + } + + cfg_num_threads = + std::min(cfg_max_threads, cfg_batch_size * gpu_count * 2); + } + + if (cfg_num_threads < cfg_batch_size) { + printf( + "Number of threads = %d must be no smaller than batch size = %d\n", + cfg_num_threads, cfg_batch_size); + exit(EXIT_FAILURE); + } +} +#endif + +static void parse_commandline(const int argc, const char* const argv[]) { namespace po = boost::program_options; // Declare the supported options. po::options_description gen_desc("Generic options"); gen_desc.add_options() ("help,h", "Show commandline options.") ("gtp,g", "Enable GTP mode.") - ("threads,t", po::value()->default_value(cfg_num_threads), - "Number of threads to use.") + ("threads,t", po::value()->default_value(0), + "Number of threads to use. Select 0 to let leela-zero pick a reasonable default.") ("playouts,p", po::value(), "Weaken engine by limiting the number of playouts. 
" "Requires --noponder.") @@ -68,23 +152,39 @@ static void parse_commandline(int argc, char *argv[]) { ("resignpct,r", po::value()->default_value(cfg_resignpct), "Resign when winrate is less than x%.\n" "-1 uses 10% but scales for handicap.") - ("weights,w", po::value(), "File with network weights.") - ("logfile,l", po::value(), "File to log input/output to.") + ("weights,w", po::value()->default_value(cfg_weightsfile), + "File with network weights.") + ("logfile,l", po::value(), + "File to log input/output to.") ("quiet,q", "Disable all diagnostic output.") ("timemanage", po::value()->default_value("auto"), - "[auto|on|off|fast] Enable time management features.\n" - "auto = off when using -m, otherwise on") + "[auto|on|off|fast|no_pruning] Enable time management features.\n" + "auto = no_pruning when using -n, otherwise on.\n" + "on = Cut off search when the best move can't change" + ", but use full time if moving faster doesn't save time.\n" + "fast = Same as on but always plays faster.\n" + "no_pruning = For self play training use.\n") ("noponder", "Disable thinking on opponent's time.") ("benchmark", "Test network and exit. Default args:\n-v3200 --noponder " "-m0 -t1 -s1.") +#ifndef USE_CPU_ONLY + ("cpu-only", "Use CPU-only implementation and do not use OpenCL device(s).") +#endif ; #ifdef USE_OPENCL - po::options_description gpu_desc("GPU options"); + po::options_description gpu_desc("OpenCL device options"); gpu_desc.add_options() - ("gpu", po::value >(), + ("gpu", po::value>(), "ID of the OpenCL device(s) to use (disables autodetection).") ("full-tuner", "Try harder to find an optimal OpenCL tuning.") ("tune-only", "Tune OpenCL only and then exit.") + ("batchsize", po::value()->default_value(0), + "Max batch size. 
Select 0 to let leela-zero pick a reasonable default.") +#ifdef USE_HALF + ("precision", po::value(), + "Floating-point precision (single/half/auto).\n" + "Default is to auto which automatically determines which one to use.") +#endif ; #endif po::options_description selfplay_desc("Self-play options"); @@ -95,48 +195,57 @@ static void parse_commandline(int argc, char *argv[]) { ("dumbpass,d", "Don't use heuristics for smarter passing.") ("randomcnt,m", po::value()->default_value(cfg_random_cnt), "Play more randomly the first x moves.") - ("randomvisits", - po::value()->default_value(cfg_random_min_visits), - "Don't play random moves if they have <= x visits.") - ("randomtemp", - po::value()->default_value(cfg_random_temp), - "Temperature to use for random move selection.") - ; + ("randomvisits", po::value()->default_value(cfg_random_min_visits), + "Don't play random moves if they have <= x visits.") + ("randomtemp", po::value()->default_value(cfg_random_temp), + "Temperature to use for random move selection."); #ifdef USE_TUNER po::options_description tuner_desc("Tuning options"); tuner_desc.add_options() ("puct", po::value()) + ("logpuct", po::value()) + ("logconst", po::value()) ("softmax_temp", po::value()) ("fpu_reduction", po::value()) - ; + ("ci_alpha", po::value()); #endif // These won't be shown, we use them to catch incorrect usage of the // command line. + po::options_description ignore("Ignored options"); +#ifndef USE_OPENCL + ignore.add_options() + ("batchsize", po::value()->default_value(1), + "Max batch size."); +#endif po::options_description h_desc("Hidden options"); h_desc.add_options() ("arguments", po::value>()); po::options_description visible; - visible.add(gen_desc) + visible + .add(gen_desc) #ifdef USE_OPENCL - .add(gpu_desc) + .add(gpu_desc) #endif - .add(selfplay_desc) + .add(selfplay_desc) #ifdef USE_TUNER - .add(tuner_desc); + .add(tuner_desc); #else ; #endif // Parse both the above, we will check if any of the latter are present. 
po::options_description all; - all.add(visible).add(h_desc); + all.add(visible).add(ignore).add(h_desc); po::positional_options_description p_desc; p_desc.add("arguments", -1); po::variables_map vm; try { po::store(po::command_line_parser(argc, argv) - .options(all).positional(p_desc).run(), vm); + .options(all) + .positional(p_desc) + .run(), + vm); po::notify(vm); - } catch(const boost::program_options::error& e) { + } catch (const boost::program_options::error& e) { printf("ERROR: %s\n", e.what()); license_blurb(); std::cout << visible << std::endl; @@ -164,19 +273,28 @@ static void parse_commandline(int argc, char *argv[]) { } if (vm.count("benchmark")) { - cfg_quiet = true; // Set this early to avoid unnecessary output. + cfg_quiet = true; // Set this early to avoid unnecessary output. } #ifdef USE_TUNER if (vm.count("puct")) { cfg_puct = vm["puct"].as(); } + if (vm.count("logpuct")) { + cfg_logpuct = vm["logpuct"].as(); + } + if (vm.count("logconst")) { + cfg_logconst = vm["logconst"].as(); + } if (vm.count("softmax_temp")) { cfg_softmax_temp = vm["softmax_temp"].as(); } if (vm.count("fpu_reduction")) { cfg_fpu_reduction = vm["fpu_reduction"].as(); } + if (vm.count("ci_alpha")) { + cfg_ci_alpha = vm["ci_alpha"].as(); + } #endif if (vm.count("logfile")) { @@ -185,10 +303,12 @@ static void parse_commandline(int argc, char *argv[]) { cfg_logfile_handle = fopen(cfg_logfile.c_str(), "a"); } - if (vm.count("weights")) { - cfg_weightsfile = vm["weights"].as(); - } else { + cfg_weightsfile = vm["weights"].as(); + if (vm["weights"].defaulted() + && !boost::filesystem::exists(cfg_weightsfile)) { printf("A network weights file is required to use the program.\n"); + printf("By default, Leela Zero looks for it in %s.\n", + cfg_weightsfile.c_str()); exit(EXIT_FAILURE); } @@ -196,14 +316,61 @@ static void parse_commandline(int argc, char *argv[]) { cfg_gtp_mode = true; } - if (!vm["threads"].defaulted()) { - auto num_threads = vm["threads"].as(); - if (num_threads > 
cfg_max_threads) { - myprintf("Clamping threads to maximum = %d\n", cfg_max_threads); - } else if (num_threads != cfg_num_threads) { - cfg_num_threads = num_threads; +#ifdef USE_OPENCL + if (vm.count("gpu")) { + cfg_gpus = vm["gpu"].as>(); + } + + if (vm.count("full-tuner")) { + cfg_sgemm_exhaustive = true; + + // --full-tuner auto-implies --tune-only. The full tuner is so slow + // that nobody will wait for it to finish befure running a game. + // This simply prevents some edge cases from confusing other people. + cfg_tune_only = true; + } + + if (vm.count("tune-only")) { + cfg_tune_only = true; + } +#ifdef USE_HALF + if (vm.count("precision")) { + auto precision = vm["precision"].as(); + if ("single" == precision) { + cfg_precision = precision_t::SINGLE; + } else if ("half" == precision) { + cfg_precision = precision_t::HALF; + } else if ("auto" == precision) { + cfg_precision = precision_t::AUTO; + } else { + printf("Unexpected option for --precision, expecting single/half/auto\n"); + exit(EXIT_FAILURE); } } + if (cfg_precision == precision_t::AUTO) { + // Auto precision is not supported for full tuner cases. 
+ if (cfg_sgemm_exhaustive) { + printf("Automatic precision not supported when doing exhaustive tuning\n"); + printf("Please add '--precision single' or '--precision half'\n"); + exit(EXIT_FAILURE); + } + } +#endif + if (vm.count("cpu-only")) { + cfg_cpu_only = true; + } +#else + cfg_cpu_only = true; +#endif + + if (cfg_cpu_only) { + calculate_thread_count_cpu(vm); + } else { +#ifdef USE_OPENCL + calculate_thread_count_gpu(vm); + myprintf("Using OpenCL batch size of %d\n", cfg_batch_size); +#endif + } myprintf("Using %d thread(s).\n", cfg_num_threads); if (vm.count("seed")) { @@ -277,6 +444,8 @@ static void parse_commandline(int argc, char *argv[]) { cfg_timemanage = TimeManagement::OFF; } else if (tm == "fast") { cfg_timemanage = TimeManagement::FAST; + } else if (tm == "no_pruning") { + cfg_timemanage = TimeManagement::NO_PRUNING; } else { printf("Invalid timemanage value.\n"); exit(EXIT_FAILURE); @@ -284,47 +453,35 @@ static void parse_commandline(int argc, char *argv[]) { } if (cfg_timemanage == TimeManagement::AUTO) { cfg_timemanage = - cfg_random_cnt ? TimeManagement::OFF : TimeManagement::ON; + cfg_noise ? TimeManagement::NO_PRUNING : TimeManagement::ON; } if (vm.count("lagbuffer")) { int lagbuffer = vm["lagbuffer"].as(); if (lagbuffer != cfg_lagbuffer_cs) { - myprintf("Using per-move time margin of %.2fs.\n", lagbuffer/100.0f); + myprintf("Using per-move time margin of %.2fs.\n", + lagbuffer / 100.0f); cfg_lagbuffer_cs = lagbuffer; } } - -#ifdef USE_OPENCL - if (vm.count("gpu")) { - cfg_gpus = vm["gpu"].as >(); - } - - if (vm.count("full-tuner")) { - cfg_sgemm_exhaustive = true; - } - - if (vm.count("tune-only")) { - cfg_tune_only = true; - } -#endif - if (vm.count("benchmark")) { // These must be set later to override default arguments. cfg_allow_pondering = false; cfg_benchmark = true; - cfg_noise = false; // Not much of a benchmark if random was used. + cfg_noise = false; // Not much of a benchmark if random was used. 
cfg_random_cnt = 0; cfg_rng_seed = 1; - cfg_timemanage = TimeManagement::OFF; // Reliable number of playouts. - if (vm["threads"].defaulted()) { - cfg_num_threads = 1; - } + cfg_timemanage = TimeManagement::OFF; // Reliable number of playouts. + if (!vm.count("playouts") && !vm.count("visits")) { cfg_max_visits = 3200; // Default to self-play and match values. } } + // Do not lower the expected eval for root moves that are likely not + // the best if we have introduced noise there exactly to explore more. + cfg_fpu_root_reduction = cfg_noise ? 0.0f : cfg_fpu_reduction; + auto out = std::stringstream{}; for (auto i = 1; i < argc; i++) { out << " " << argv[i]; @@ -335,6 +492,14 @@ static void parse_commandline(int argc, char *argv[]) { cfg_options_str = out.str(); } +static void initialize_network() { + auto network = std::make_unique(); + auto playouts = std::min(cfg_max_playouts, cfg_max_visits); + network->initialize(playouts, cfg_weightsfile); + + GTP::initialize(std::move(network)); +} + // Setup global objects after command line has been parsed void init_global_objects() { thread_pool.initialize(cfg_num_threads); @@ -348,25 +513,23 @@ void init_global_objects() { // improves reproducibility across platforms. Random::get_Rng().seedrandom(cfg_rng_seed); - // When visits are limited ensure cache size is still limited. - auto playouts = std::min(cfg_max_playouts, cfg_max_visits); - NNCache::get_NNCache().set_size_from_playouts(playouts); + Utils::create_z_table(); - // Initialize network - Network::initialize(); + initialize_network(); } void benchmark(GameState& game) { - game.set_timecontrol(0, 1, 0, 0); // Set infinite time. - game.play_textmove("b", "q16"); - auto search = std::make_unique(game); + game.set_timecontrol(0, 1, 0, 0); // Set infinite time. 
+ game.play_textmove("b", "r16"); + game.play_textmove("w", "d4"); + game.play_textmove("b", "c3"); + + auto search = std::make_unique(game, *GTP::s_network); game.set_to_move(FastBoard::WHITE); search->think(FastBoard::WHITE); } -int main(int argc, char *argv[]) { - auto input = std::string{}; - +int main(int argc, char* argv[]) { // Set up engine parameters GTP::setup_default_parameters(); parse_commandline(argc, argv); @@ -378,7 +541,7 @@ int main(int argc, char *argv[]) { setbuf(stdout, nullptr); setbuf(stderr, nullptr); -#ifndef WIN32 +#ifndef _WIN32 setbuf(stdin, nullptr); #endif @@ -391,8 +554,7 @@ int main(int argc, char *argv[]) { auto maingame = std::make_unique(); /* set board limits */ - auto komi = 7.5f; - maingame->init_game(BOARD_SIZE, komi); + maingame->init_game(BOARD_SIZE, KOMI); if (cfg_benchmark) { cfg_quiet = false; @@ -406,6 +568,7 @@ int main(int argc, char *argv[]) { std::cout << "Leela: "; } + auto input = std::string{}; if (std::getline(std::cin, input)) { Utils::log_input(input); GTP::execute(*maingame, input); diff --git a/src/Makefile b/src/Makefile index b9b9cf4e5..fe89fa22c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -3,30 +3,30 @@ THE_OS := $(shell uname -s) default: @echo "Detected OS: ${THE_OS}" $(MAKE) CC=gcc CXX=g++ \ - CXXFLAGS='$(CXXFLAGS) -Wall -Wextra -pipe -O3 -g -ffast-math -flto -march=native -std=c++14 -DNDEBUG' \ + CXXFLAGS='$(CXXFLAGS) -Wall -Wextra -Wno-deprecated-copy -pipe -O3 -g -ffast-math -flto -march=native -std=c++14 -DNDEBUG' \ LDFLAGS='$(LDFLAGS) -flto -g' \ leelaz debug: @echo "Detected OS: ${THE_OS}" $(MAKE) CC=gcc CXX=g++ \ - CXXFLAGS='$(CXXFLAGS) -Wall -Wextra -pipe -Og -g -std=c++14' \ + CXXFLAGS='$(CXXFLAGS) -Wall -Wextra -Wno-deprecated-copy -pipe -Og -g -std=c++14' \ LDFLAGS='$(LDFLAGS) -g' \ leelaz clang: @echo "Detected OS: ${THE_OS}" - $(MAKE) CC=clang-5.0 CXX=clang++-5.0 \ - CXXFLAGS='$(CXXFLAGS) -Wall -Wextra -Wno-missing-braces -O3 -ffast-math -flto -march=native -std=c++14 -DNDEBUG' \ + 
$(MAKE) CC=clang CXX=clang++ \ + CXXFLAGS='$(CXXFLAGS) -Wall -Wextra -Wno-deprecated-copy -O3 -ffast-math -flto -march=native -std=c++14 -DNDEBUG' \ LDFLAGS='$(LDFLAGS) -flto -fuse-linker-plugin' \ leelaz -DYNAMIC_LIBS = -lboost_program_options -lpthread -lz +DYNAMIC_LIBS = -lboost_system -lboost_filesystem -lboost_program_options -lpthread -lz LIBS = ifeq ($(THE_OS),Linux) # for Linux with OpenBLAS - CXXFLAGS += -I/usr/include/openblas + CXXFLAGS += -I/usr/include/openblas -I./Eigen DYNAMIC_LIBS += -lopenblas DYNAMIC_LIBS += -lOpenCL endif @@ -34,6 +34,7 @@ ifeq ($(THE_OS),Darwin) # for macOS (comment out the Linux part) LIBS += -framework Accelerate LIBS += -framework OpenCL + CXXFLAGS += -I./Eigen CXXFLAGS += -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Headers endif @@ -50,7 +51,7 @@ sources = Network.cpp FullBoard.cpp KoState.cpp Training.cpp \ SGFParser.cpp Timing.cpp Utils.cpp FastBoard.cpp \ SGFTree.cpp Zobrist.cpp FastState.cpp GTP.cpp Random.cpp \ SMP.cpp UCTNode.cpp UCTNodePointer.cpp UCTNodeRoot.cpp \ - OpenCL.cpp OpenCLScheduler.cpp NNCache.cpp Tuner.cpp + OpenCL.cpp OpenCLScheduler.cpp NNCache.cpp Tuner.cpp CPUPipe.cpp objects = $(sources:.cpp=.o) deps = $(sources:%.cpp=%.d) diff --git a/src/NNCache.cpp b/src/NNCache.cpp index e0f110379..055087156 100644 --- a/src/NNCache.cpp +++ b/src/NNCache.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Michael O and contributors + Copyright (C) 2017-2019 Michael O and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,29 +14,43 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" + #include +#include #include "NNCache.h" -#include "Utils.h" + +#include "GTP.h" #include "UCTSearch.h" +#include "Utils.h" -NNCache::NNCache(int size) : m_size(size) {} +const int NNCache::MAX_CACHE_COUNT; +const int NNCache::MIN_CACHE_COUNT; +const size_t NNCache::ENTRY_SIZE; -NNCache& NNCache::get_NNCache(void) { - static NNCache cache; - return cache; -} +NNCache::NNCache(const int size) : m_size(size) {} -bool NNCache::lookup(std::uint64_t hash, Network::Netresult & result) { +bool NNCache::lookup(const std::uint64_t hash, Netresult& result) { std::lock_guard lock(m_mutex); ++m_lookups; auto iter = m_cache.find(hash); if (iter == m_cache.end()) { - return false; // Not found. + return false; // Not found. } const auto& entry = iter->second; @@ -47,12 +61,11 @@ bool NNCache::lookup(std::uint64_t hash, Network::Netresult & result) { return true; } -void NNCache::insert(std::uint64_t hash, - const Network::Netresult& result) { +void NNCache::insert(const std::uint64_t hash, const Netresult& result) { std::lock_guard lock(m_mutex); if (m_cache.find(hash) != m_cache.end()) { - return; // Already in the cache. + return; // Already in the cache. 
} m_cache.emplace(hash, std::make_unique(result)); @@ -66,7 +79,7 @@ void NNCache::insert(std::uint64_t hash, } } -void NNCache::resize(int size) { +void NNCache::resize(const int size) { m_size = size; while (m_order.size() > m_size) { m_cache.erase(m_order.front()); @@ -74,22 +87,30 @@ void NNCache::resize(int size) { } } -void NNCache::set_size_from_playouts(int max_playouts) { +void NNCache::clear() { + m_cache.clear(); + m_order.clear(); +} + +void NNCache::set_size_from_playouts(const int max_playouts) { // cache hits are generally from last several moves so setting cache // size based on playouts increases the hit rate while balancing memory - // usage for low playout instances. 150'000 cache entries is ~225 MB + // usage for low playout instances. 150'000 cache entries is ~208 MiB constexpr auto num_cache_moves = 3; auto max_playouts_per_move = - std::min(max_playouts, - UCTSearch::UNLIMITED_PLAYOUTS / num_cache_moves); + std::min(max_playouts, UCTSearch::UNLIMITED_PLAYOUTS / num_cache_moves); auto max_size = num_cache_moves * max_playouts_per_move; - max_size = std::min(150'000, std::max(6'000, max_size)); - NNCache::get_NNCache().resize(max_size); + max_size = std::min(MAX_CACHE_COUNT, std::max(MIN_CACHE_COUNT, max_size)); + resize(max_size); } void NNCache::dump_stats() { Utils::myprintf( "NNCache: %d/%d hits/lookups = %.1f%% hitrate, %d inserts, %u size\n", - m_hits, m_lookups, 100. * m_hits / (m_lookups + 1), - m_inserts, m_cache.size()); + m_hits, m_lookups, 100. * m_hits / (m_lookups + 1), m_inserts, + m_cache.size()); +} + +size_t NNCache::get_estimated_size() { + return m_order.size() * NNCache::ENTRY_SIZE; } diff --git a/src/NNCache.h b/src/NNCache.h index 6901e5956..d2a7eb943 100644 --- a/src/NNCache.h +++ b/src/NNCache.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Michael O and contributors + Copyright (C) 2017-2019 Michael O and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef NNCACHE_H_INCLUDED @@ -21,29 +32,53 @@ #include "config.h" +#include #include +#include #include #include -#include "Network.h" - class NNCache { public: - // return the global NNCache - static NNCache& get_NNCache(void); + // Maximum size of the cache in number of items. + static constexpr int MAX_CACHE_COUNT = 150'000; + + // Minimum size of the cache in number of items. + static constexpr int MIN_CACHE_COUNT = 6'000; + + struct Netresult { + // 19x19 board positions + std::array policy; + + // pass + float policy_pass; + + // winrate + float winrate; + + Netresult() : policy_pass(0.0f), winrate(0.0f) { + policy.fill(0.0f); + } + }; + + static constexpr size_t ENTRY_SIZE = sizeof(Netresult) + + sizeof(std::uint64_t) + + sizeof(std::unique_ptr); + + NNCache(int size = MAX_CACHE_COUNT); // ~ 208MiB // Set a reasonable size gives max number of playouts void set_size_from_playouts(int max_playouts); // Resize NNCache void resize(int size); + void clear(); // Try and find an existing entry. 
- bool lookup(std::uint64_t hash, Network::Netresult & result); + bool lookup(std::uint64_t hash, Netresult& result); // Insert a new entry. - void insert(std::uint64_t hash, - const Network::Netresult& result); + void insert(std::uint64_t hash, const Netresult& result); // Return the hit rate ratio. std::pair hit_rate() const { @@ -52,9 +87,10 @@ class NNCache { void dump_stats(); -private: - NNCache(int size = 150000); // ~ 225MB + // Return the estimated memory consumption of the cache. + size_t get_estimated_size(); +private: std::mutex m_mutex; size_t m_size; @@ -65,9 +101,8 @@ class NNCache { int m_inserts{0}; struct Entry { - Entry(const Network::Netresult& r) - : result(r) {} - Network::Netresult result; // ~ 1.5KB + Entry(const Netresult& r) : result(r) {} + Netresult result; // ~ 1.4KiB }; // Map from hash to {features, result} diff --git a/src/Network.cpp b/src/Network.cpp index d629afe06..c04868fc0 100644 --- a/src/Network.cpp +++ b/src/Network.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,23 +14,35 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . -*/ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ #include "config.h" -#include "Network.h" #include #include +#include +#include +#include #include #include #include #include #include #include -#include -#include -#include +#ifndef USE_BLAS +#include +#endif #ifdef __APPLE__ #include @@ -41,18 +53,18 @@ #ifdef USE_OPENBLAS #include #endif +#include "CPUPipe.h" +#include "Network.h" #include "zlib.h" #ifdef USE_OPENCL #include "OpenCLScheduler.h" #include "UCTNode.h" #endif - #include "FastBoard.h" #include "FastState.h" #include "FullBoard.h" -#include "GameState.h" #include "GTP.h" -#include "Im2Col.h" +#include "GameState.h" #include "NNCache.h" #include "Random.h" #include "ThreadPool.h" @@ -62,36 +74,56 @@ namespace x3 = boost::spirit::x3; using namespace Utils; -// Input + residual block tower -static std::vector> conv_weights; -static std::vector> conv_biases; -static std::vector> batchnorm_means; -static std::vector> batchnorm_stddivs; +#ifndef USE_BLAS +// Eigen helpers +template +using EigenVectorMap = Eigen::Map>; +template +using ConstEigenVectorMap = + Eigen::Map>; +template +using ConstEigenMatrixMap = + Eigen::Map>; +#endif -// Policy head -static std::vector conv_pol_w; -static std::vector conv_pol_b; -static std::array bn_pol_w1; -static std::array bn_pol_w2; +// Symmetry helper +static std::array, Network::NUM_SYMMETRIES> + symmetry_nn_idx_table; -static std::array ip_pol_w; -static std::array ip_pol_b; +float Network::benchmark_time(const int centiseconds) { + const auto cpus = cfg_num_threads; -// Value head -static std::vector conv_val_w; -static std::vector conv_val_b; -static std::array bn_val_w1; -static std::array bn_val_w2; + ThreadGroup tg(thread_pool); + std::atomic runcount{0}; -static std::array ip1_val_w; -static std::array ip1_val_b; + GameState state; + state.init_game(BOARD_SIZE, KOMI); -static std::array ip2_val_w; -static std::array ip2_val_b; -static bool value_head_not_stm; + // As a sanity run, try one run with self check. 
+ // Isn't enough to guarantee correctness but better than nothing, + // plus for large nets self-check takes a while (1~3 eval per second) + get_output(&state, Ensemble::RANDOM_SYMMETRY, -1, false, true, true); -// Symmetry helper -static std::array, 8> symmetry_nn_idx_table; + const Time start; + for (auto i = size_t{0}; i < cpus; i++) { + tg.add_task([this, &runcount, start, centiseconds, state]() { + while (true) { + runcount++; + get_output(&state, Ensemble::RANDOM_SYMMETRY, -1, false); + const Time end; + const auto elapsed = Time::timediff_centis(start, end); + if (elapsed >= centiseconds) { + break; + } + } + }); + } + tg.wait_all(); + + const Time end; + const auto elapsed = Time::timediff_centis(start, end); + return 100.0f * runcount.load() / elapsed; +} void Network::benchmark(const GameState* const state, const int iterations) { const auto cpus = cfg_num_threads; @@ -100,11 +132,11 @@ void Network::benchmark(const GameState* const state, const int iterations) { ThreadGroup tg(thread_pool); std::atomic runcount{0}; - for (auto i = 0; i < cpus; i++) { - tg.add_task([&runcount, iterations, state]() { + for (auto i = size_t{0}; i < cpus; i++) { + tg.add_task([this, &runcount, iterations, state]() { while (runcount < iterations) { runcount++; - get_scored_moves(state, Ensemble::RANDOM_SYMMETRY, -1, true); + get_output(state, Ensemble::RANDOM_SYMMETRY, -1, false); } }); } @@ -116,7 +148,9 @@ void Network::benchmark(const GameState* const state, const int iterations) { runcount.load(), elapsed, int(runcount.load() / elapsed)); } -void Network::process_bn_var(std::vector& weights, const float epsilon) { +template +void process_bn_var(container& weights) { + constexpr auto epsilon = 1e-5f; for (auto&& w : weights) { w = 1.0f / std::sqrt(w + epsilon); } @@ -125,78 +159,74 @@ void Network::process_bn_var(std::vector& weights, const float epsilon) { std::vector Network::winograd_transform_f(const std::vector& f, const int outputs, const int channels) { - // F(2x2, 
3x3) Winograd filter transformation + // F(4x4, 3x3) Winograd filter transformation // transpose(G.dot(f).dot(G.transpose())) // U matrix is transposed for better memory layout in SGEMM auto U = std::vector(WINOGRAD_TILE * outputs * channels); - const auto G = std::array{ 1.0, 0.0, 0.0, - 0.5, 0.5, 0.5, - 0.5, -0.5, 0.5, - 0.0, 0.0, 1.0}; - auto temp = std::array{}; - - for (auto o = 0; o < outputs; o++) { - for (auto c = 0; c < channels; c++) { - for (auto i = 0; i < 4; i++){ - for (auto j = 0; j < 3; j++) { - auto acc = 0.0f; - for (auto k = 0; k < 3; k++) { - acc += G[i*3 + k] * f[o*channels*9 + c*9 + k*3 + j]; + const auto G = std::array{ + 1.0f, 0.0f, 0.0f, + -2.0f / 3.0f, -SQ2 / 3.0f, -1.0f / 3.0f, + -2.0f / 3.0f, SQ2 / 3.0f, -1.0f / 3.0f, + 1.0f / 6.0f, SQ2 / 6.0f, 1.0f / 3.0f, + 1.0f / 6.0f, -SQ2 / 6.0f, 1.0f / 3.0f, + 0.0f, 0.0f, 1.0f}; + + auto temp = std::array{}; + + constexpr auto max_buffersize = 8; + auto buffersize = max_buffersize; + + if (outputs % buffersize != 0) { + buffersize = 1; + } + + std::array buffer; + + for (auto c = 0; c < channels; c++) { + for (auto o_b = 0; o_b < outputs / buffersize; o_b++) { + for (auto bufferline = 0; bufferline < buffersize; bufferline++) { + const auto o = o_b * buffersize + bufferline; + + for (auto i = 0; i < WINOGRAD_ALPHA; i++) { + for (auto j = 0; j < 3; j++) { + auto acc = 0.0f; + for (auto k = 0; k < 3; k++) { + acc += G[i * 3 + k] + * f[o * channels * 9 + c * 9 + k * 3 + j]; + } + temp[i * 3 + j] = acc; } - temp[i*3 + j] = acc; } - } - for (auto xi = 0; xi < 4; xi++) { - for (auto nu = 0; nu < 4; nu++) { - auto acc = 0.0f; - for (auto k = 0; k < 3; k++) { - acc += temp[xi*3 + k] * G[nu*3 + k]; + for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) { + for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) { + auto acc = 0.0f; + for (auto k = 0; k < 3; k++) { + acc += temp[xi * 3 + k] * G[nu * 3 + k]; + } + buffer[(xi * WINOGRAD_ALPHA + nu) * buffersize + + bufferline] = acc; } - U[xi * (4 * outputs * channels) - + nu * 
(outputs * channels) - + c * outputs - + o] = acc; } } - } - } - - return U; -} - -std::vector Network::zeropad_U(const std::vector& U, - const int outputs, const int channels, - const int outputs_pad, - const int channels_pad) { - // Fill with zeroes - auto Upad = std::vector(WINOGRAD_TILE * outputs_pad * channels_pad); - - for (auto o = 0; o < outputs; o++) { - for (auto c = 0; c < channels; c++) { - for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++){ - for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) { - Upad[xi * (WINOGRAD_ALPHA * outputs_pad * channels_pad) - + nu * (outputs_pad * channels_pad) - + c * outputs_pad + - o] = - U[xi * (WINOGRAD_ALPHA * outputs * channels) - + nu * (outputs * channels) - + c * outputs - + o]; + for (auto i = 0; i < WINOGRAD_ALPHA * WINOGRAD_ALPHA; i++) { + for (auto entry = 0; entry < buffersize; entry++) { + const auto o = o_b * buffersize + entry; + U[i * outputs * channels + c * outputs + o] = + buffer[buffersize * i + entry]; } } } } - return Upad; + return U; } std::pair Network::load_v1_network(std::istream& wtfile) { // Count size of the network myprintf("Detecting residual layers..."); // We are version 1 or 2 - if (value_head_not_stm) { + if (m_value_head_not_stm) { myprintf("v%d...", 2); } else { myprintf("v%d...", 1); @@ -241,59 +271,85 @@ std::pair Network::load_v1_network(std::istream& wtfile) { while (std::getline(wtfile, line)) { std::vector weights; auto it_line = line.cbegin(); - const auto ok = phrase_parse(it_line, line.cend(), - *x3::float_, x3::space, weights); + const auto ok = + phrase_parse(it_line, line.cend(), *x3::float_, x3::space, weights); if (!ok || it_line != line.cend()) { myprintf("\nFailed to parse weight file. 
Error on line %d.\n", - linecount + 2); //+1 from version line, +1 from 0-indexing - return {0,0}; + linecount + 2); //+1 from version line, +1 from 0-indexing + return {0, 0}; } if (linecount < plain_conv_wts) { if (linecount % 4 == 0) { - conv_weights.emplace_back(weights); + m_fwd_weights->m_conv_weights.emplace_back(weights); } else if (linecount % 4 == 1) { // Redundant in our model, but they encode the // number of outputs so we have to read them in. - conv_biases.emplace_back(weights); + m_fwd_weights->m_conv_biases.emplace_back(weights); } else if (linecount % 4 == 2) { - batchnorm_means.emplace_back(weights); + m_fwd_weights->m_batchnorm_means.emplace_back(weights); } else if (linecount % 4 == 3) { process_bn_var(weights); - batchnorm_stddivs.emplace_back(weights); + m_fwd_weights->m_batchnorm_stddevs.emplace_back(weights); + } + } else { + switch (linecount - plain_conv_wts) { + case 0: m_fwd_weights->m_conv_pol_w = std::move(weights); break; + case 1: m_fwd_weights->m_conv_pol_b = std::move(weights); break; + case 2: + std::copy(cbegin(weights), cend(weights), + begin(m_bn_pol_w1)); + break; + case 3: + std::copy(cbegin(weights), cend(weights), + begin(m_bn_pol_w2)); + break; + case 4: + if (weights.size() + != OUTPUTS_POLICY * NUM_INTERSECTIONS + * POTENTIAL_MOVES) { + myprintf("The weights file is not for %dx%d boards.\n", + BOARD_SIZE, BOARD_SIZE); + return {0, 0}; + } + std::copy(cbegin(weights), cend(weights), + begin(m_ip_pol_w)); + break; + case 5: + std::copy(cbegin(weights), cend(weights), + begin(m_ip_pol_b)); + break; + case 6: m_fwd_weights->m_conv_val_w = std::move(weights); break; + case 7: m_fwd_weights->m_conv_val_b = std::move(weights); break; + case 8: + std::copy(cbegin(weights), cend(weights), + begin(m_bn_val_w1)); + break; + case 9: + std::copy(cbegin(weights), cend(weights), + begin(m_bn_val_w2)); + break; + case 10: + std::copy(cbegin(weights), cend(weights), + begin(m_ip1_val_w)); + break; + case 11: + std::copy(cbegin(weights), 
cend(weights), + begin(m_ip1_val_b)); + break; + case 12: + std::copy(cbegin(weights), cend(weights), + begin(m_ip2_val_w)); + break; + case 13: + std::copy(cbegin(weights), cend(weights), + begin(m_ip2_val_b)); + break; } - } else if (linecount == plain_conv_wts) { - conv_pol_w = std::move(weights); - } else if (linecount == plain_conv_wts + 1) { - conv_pol_b = std::move(weights); - } else if (linecount == plain_conv_wts + 2) { - std::copy(cbegin(weights), cend(weights), begin(bn_pol_w1)); - } else if (linecount == plain_conv_wts + 3) { - process_bn_var(weights); - std::copy(cbegin(weights), cend(weights), begin(bn_pol_w2)); - } else if (linecount == plain_conv_wts + 4) { - std::copy(cbegin(weights), cend(weights), begin(ip_pol_w)); - } else if (linecount == plain_conv_wts + 5) { - std::copy(cbegin(weights), cend(weights), begin(ip_pol_b)); - } else if (linecount == plain_conv_wts + 6) { - conv_val_w = std::move(weights); - } else if (linecount == plain_conv_wts + 7) { - conv_val_b = std::move(weights); - } else if (linecount == plain_conv_wts + 8) { - std::copy(cbegin(weights), cend(weights), begin(bn_val_w1)); - } else if (linecount == plain_conv_wts + 9) { - process_bn_var(weights); - std::copy(cbegin(weights), cend(weights), begin(bn_val_w2)); - } else if (linecount == plain_conv_wts + 10) { - std::copy(cbegin(weights), cend(weights), begin(ip1_val_w)); - } else if (linecount == plain_conv_wts + 11) { - std::copy(cbegin(weights), cend(weights), begin(ip1_val_b)); - } else if (linecount == plain_conv_wts + 12) { - std::copy(cbegin(weights), cend(weights), begin(ip2_val_w)); - } else if (linecount == plain_conv_wts + 13) { - std::copy(cbegin(weights), cend(weights), begin(ip2_val_b)); } linecount++; } + process_bn_var(m_bn_pol_w2); + process_bn_var(m_bn_val_w2); return {channels, static_cast(residual_blocks)}; } @@ -335,12 +391,12 @@ std::pair Network::load_network_file(const std::string& filename) { return {0, 0}; } else { // Version 2 networks are identical to 
v1, except - // that they return the score for black instead of + // that they return the value for black instead of // the player to move. This is used by ELF Open Go. if (format_version == 2) { - value_head_not_stm = true; + m_value_head_not_stm = true; } else { - value_head_not_stm = false; + m_value_head_not_stm = false; } return load_v1_network(buffer); } @@ -348,107 +404,95 @@ std::pair Network::load_network_file(const std::string& filename) { return {0, 0}; } -void Network::initialize() { - // Prepare symmetry table - for (auto s = 0; s < 8; s++) { - for (auto v = 0; v < BOARD_SQUARES; v++) { - symmetry_nn_idx_table[s][v] = get_nn_idx_symmetry(v, s); - } - } - - // Load network from file - size_t channels, residual_blocks; - std::tie(channels, residual_blocks) = load_network_file(cfg_weightsfile); - if (channels == 0) { - exit(EXIT_FAILURE); - } +std::unique_ptr&& Network::init_net( + const int channels, std::unique_ptr&& pipe) { - auto weight_index = size_t{0}; - // Input convolution - // Winograd transform convolution weights - conv_weights[weight_index] = - winograd_transform_f(conv_weights[weight_index], - channels, INPUT_CHANNELS); - weight_index++; + pipe->initialize(channels); + pipe->push_weights(WINOGRAD_ALPHA, INPUT_CHANNELS, channels, m_fwd_weights); - // Residual block convolutions - for (auto i = size_t{0}; i < residual_blocks * 2; i++) { - conv_weights[weight_index] = - winograd_transform_f(conv_weights[weight_index], - channels, channels); - weight_index++; - } + return std::move(pipe); +} - // Biases are not calculated and are typically zero but some networks might - // still have non-zero biases. - // Move biases to batchnorm means to make the output match without having - // to separately add the biases. 
- for (auto i = size_t{0}; i < conv_biases.size(); i++) { - for (auto j = size_t{0}; j < batchnorm_means[i].size(); j++) { - batchnorm_means[i][j] -= conv_biases[i][j]; - conv_biases[i][j] = 0.0f; +#ifdef USE_HALF +void Network::select_precision(const int channels) { + if (cfg_precision == precision_t::AUTO) { + auto score_fp16 = float{-1.0}; + auto score_fp32 = float{-1.0}; + + myprintf("Initializing OpenCL (autodetecting precision).\n"); + + // Setup fp16 here so that we can see if we can skip autodetect. + // However, if fp16 sanity check fails we will return a fp32 and pray it works. + auto fp16_net = std::make_unique>(); + if (!fp16_net->needs_autodetect()) { + try { + myprintf("OpenCL: using fp16/half or tensor core compute support.\n"); + m_forward = init_net(channels, std::move(fp16_net)); + benchmark_time(1); // a sanity check run + } catch (...) { + myprintf("OpenCL: fp16/half or tensor core failed " + "despite driver claiming support.\n"); + myprintf("Falling back to single precision\n"); + m_forward.reset(); + m_forward = init_net( + channels, std::make_unique>()); + } + return; } - } - for (auto i = size_t{0}; i < bn_val_w1.size(); i++) { - bn_val_w1[i] -= conv_val_b[i]; - conv_val_b[i] = 0.0f; - } - - for (auto i = size_t{0}; i < bn_pol_w1.size(); i++) { - bn_pol_w1[i] -= conv_pol_b[i]; - conv_pol_b[i] = 0.0f; - } - -#ifdef USE_OPENCL - myprintf("Initializing OpenCL.\n"); - opencl.initialize(channels); - - for (const auto & opencl_net : opencl.get_networks()) { - const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners(); - - const auto mwg = tuners[0]; - const auto kwg = tuners[2]; - const auto vwm = tuners[3]; - - weight_index = 0; - - const auto m_ceil = ceilMultiple(ceilMultiple(channels, mwg), vwm); - const auto k_ceil = ceilMultiple(ceilMultiple(INPUT_CHANNELS, kwg), vwm); - - const auto Upad = zeropad_U(conv_weights[weight_index], - channels, INPUT_CHANNELS, - m_ceil, k_ceil); - - // Winograd filter transformation changes filter size to 4x4 
- opencl_net->push_input_convolution(WINOGRAD_ALPHA, INPUT_CHANNELS, - channels, Upad, - batchnorm_means[weight_index], batchnorm_stddivs[weight_index]); - weight_index++; + // Start by setting up fp32. + try { + m_forward.reset(); + m_forward = + init_net(channels, std::make_unique>()); + score_fp32 = benchmark_time(100); + } catch (...) { + // empty - if exception thrown just throw away fp32 net + } - // residual blocks - for (auto i = size_t{0}; i < residual_blocks; i++) { - const auto Upad1 = zeropad_U(conv_weights[weight_index], - channels, channels, - m_ceil, m_ceil); - const auto Upad2 = zeropad_U(conv_weights[weight_index + 1], - channels, channels, - m_ceil, m_ceil); - opencl_net->push_residual(WINOGRAD_ALPHA, channels, channels, - Upad1, - batchnorm_means[weight_index], - batchnorm_stddivs[weight_index], - Upad2, - batchnorm_means[weight_index + 1], - batchnorm_stddivs[weight_index + 1]); - weight_index += 2; + // Now benchmark fp16. + try { + m_forward.reset(); + m_forward = init_net(channels, std::move(fp16_net)); + score_fp16 = benchmark_time(100); + } catch (...) 
{ + // empty - if exception thrown just throw away fp16 net } - // Output head convolutions - opencl_net->push_convolve1(channels, OUTPUTS_POLICY, conv_pol_w); - opencl_net->push_convolve1(channels, OUTPUTS_VALUE, conv_val_w); + if (score_fp16 < 0.0f && score_fp32 < 0.0f) { + myprintf("Both single precision and half precision failed to run.\n"); + throw std::runtime_error("Failed to initialize net."); + } else if (score_fp16 < 0.0f) { + myprintf("Using OpenCL single precision (half precision failed to run).\n"); + m_forward.reset(); + m_forward = + init_net(channels, std::make_unique>()); + } else if (score_fp32 < 0.0f) { + myprintf("Using OpenCL half precision (single precision failed to run).\n"); + } else if (score_fp32 * 1.05f > score_fp16) { + myprintf("Using OpenCL single precision (less than 5%% slower than half).\n"); + m_forward.reset(); + m_forward = + init_net(channels, std::make_unique>()); + } else { + myprintf("Using OpenCL half precision (at least 5%% faster than single).\n"); + } + return; + } else if (cfg_precision == precision_t::SINGLE) { + myprintf("Initializing OpenCL (single precision).\n"); + m_forward = + init_net(channels, std::make_unique>()); + return; + } else if (cfg_precision == precision_t::HALF) { + myprintf("Initializing OpenCL (half precision).\n"); + m_forward = init_net( + channels, std::make_unique>()); + return; } +} #endif + +void Network::initialize(const int playouts, const std::string& weightsfile) { #ifdef USE_BLAS #ifndef __APPLE__ #ifdef USE_OPENBLAS @@ -456,269 +500,135 @@ void Network::initialize() { myprintf("BLAS Core: %s\n", openblas_get_corename()); #endif #ifdef USE_MKL - //mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL); + // mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL); mkl_set_num_threads(1); MKLVersion Version; mkl_get_version(&Version); myprintf("BLAS core: MKL %s\n", Version.Processor); #endif #endif +#else + myprintf("BLAS Core: built-in Eigen %d.%d.%d library.\n", + EIGEN_WORLD_VERSION, 
EIGEN_MAJOR_VERSION, EIGEN_MINOR_VERSION); #endif -} -#ifdef USE_BLAS -void Network::winograd_transform_in(const std::vector& in, - std::vector& V, - const int C) { - constexpr auto W = BOARD_SIZE; - constexpr auto H = BOARD_SIZE; - constexpr auto WTILES = (W + 1) / 2; - constexpr auto P = WTILES * WTILES; - - std::array, WTILES * 2 + 2> in_pad; - for (auto xin = size_t{0}; xin < in_pad.size(); xin++) { - in_pad[0][xin] = 0.0f; - in_pad[H + 1][xin] = 0.0f; - in_pad[H + 2][xin] = 0.0f; - } - for (auto yin = size_t{1}; yin < in_pad[0].size() - 2; yin++) { - in_pad[yin][0] = 0.0f; - in_pad[yin][W + 1] = 0.0f; - in_pad[yin][W + 2] = 0.0f; - } + m_fwd_weights = std::make_shared(); - for (auto ch = 0; ch < C; ch++) { - for (auto yin = 0; yin < H; yin++) { - for (auto xin = 0; xin < W; xin++) { - in_pad[yin + 1][xin + 1] = in[ch*(W*H) + yin*W + xin]; - } - } - for (auto block_y = 0; block_y < WTILES; block_y++) { - // Tiles overlap by 2 - const auto yin = 2 * block_y; - for (auto block_x = 0; block_x < WTILES; block_x++) { - const auto xin = 2 * block_x; - - // Calculates transpose(B).x.B - // B = [[ 1.0, 0.0, 0.0, 0.0], - // [ 0.0, 1.0, -1.0, 1.0], - // [-1.0, 1.0, 1.0, 0.0], - // [ 0.0, 0.0, 0.0, -1.0]] - - using WinogradTile = - std::array, WINOGRAD_ALPHA>; - WinogradTile T1, T2; - - T1[0][0] = in_pad[yin + 0][xin + 0] - in_pad[yin + 2][xin + 0]; - T1[0][1] = in_pad[yin + 0][xin + 1] - in_pad[yin + 2][xin + 1]; - T1[0][2] = in_pad[yin + 0][xin + 2] - in_pad[yin + 2][xin + 2]; - T1[0][3] = in_pad[yin + 0][xin + 3] - in_pad[yin + 2][xin + 3]; - T1[1][0] = in_pad[yin + 1][xin + 0] + in_pad[yin + 2][xin + 0]; - T1[1][1] = in_pad[yin + 1][xin + 1] + in_pad[yin + 2][xin + 1]; - T1[1][2] = in_pad[yin + 1][xin + 2] + in_pad[yin + 2][xin + 2]; - T1[1][3] = in_pad[yin + 1][xin + 3] + in_pad[yin + 2][xin + 3]; - T1[2][0] = in_pad[yin + 2][xin + 0] - in_pad[yin + 1][xin + 0]; - T1[2][1] = in_pad[yin + 2][xin + 1] - in_pad[yin + 1][xin + 1]; - T1[2][2] = in_pad[yin + 2][xin + 2] - 
in_pad[yin + 1][xin + 2]; - T1[2][3] = in_pad[yin + 2][xin + 3] - in_pad[yin + 1][xin + 3]; - T1[3][0] = in_pad[yin + 1][xin + 0] - in_pad[yin + 3][xin + 0]; - T1[3][1] = in_pad[yin + 1][xin + 1] - in_pad[yin + 3][xin + 1]; - T1[3][2] = in_pad[yin + 1][xin + 2] - in_pad[yin + 3][xin + 2]; - T1[3][3] = in_pad[yin + 1][xin + 3] - in_pad[yin + 3][xin + 3]; - - T2[0][0] = T1[0][0] - T1[0][2]; - T2[0][1] = T1[0][1] + T1[0][2]; - T2[0][2] = T1[0][2] - T1[0][1]; - T2[0][3] = T1[0][1] - T1[0][3]; - T2[1][0] = T1[1][0] - T1[1][2]; - T2[1][1] = T1[1][1] + T1[1][2]; - T2[1][2] = T1[1][2] - T1[1][1]; - T2[1][3] = T1[1][1] - T1[1][3]; - T2[2][0] = T1[2][0] - T1[2][2]; - T2[2][1] = T1[2][1] + T1[2][2]; - T2[2][2] = T1[2][2] - T1[2][1]; - T2[2][3] = T1[2][1] - T1[2][3]; - T2[3][0] = T1[3][0] - T1[3][2]; - T2[3][1] = T1[3][1] + T1[3][2]; - T2[3][2] = T1[3][2] - T1[3][1]; - T2[3][3] = T1[3][1] - T1[3][3]; - - const auto offset = ch * P + block_y * WTILES + block_x; - for (auto i = 0; i < WINOGRAD_ALPHA; i++) { - for (auto j = 0; j < WINOGRAD_ALPHA; j++) { - V[(i*WINOGRAD_ALPHA + j)*C*P + offset] = T2[i][j]; - } - } - } + // Make a guess at a good size as long as the user doesn't + // explicitly set a maximum memory usage. 
+ m_nncache.set_size_from_playouts(playouts); + + // Prepare symmetry table + for (auto s = 0; s < NUM_SYMMETRIES; ++s) { + for (auto v = 0; v < NUM_INTERSECTIONS; ++v) { + const auto newvtx = + get_symmetry({v % BOARD_SIZE, v / BOARD_SIZE}, s); + symmetry_nn_idx_table[s][v] = + (newvtx.second * BOARD_SIZE) + newvtx.first; + assert(symmetry_nn_idx_table[s][v] >= 0 + && symmetry_nn_idx_table[s][v] < NUM_INTERSECTIONS); } } -} -void Network::winograd_sgemm(const std::vector& U, - const std::vector& V, - std::vector& M, - const int C, const int K) { - constexpr auto P = (BOARD_SIZE + 1) * (BOARD_SIZE + 1) / WINOGRAD_ALPHA; - - for (auto b = 0; b < WINOGRAD_TILE; b++) { - const auto offset_u = b * K * C; - const auto offset_v = b * C * P; - const auto offset_m = b * K * P; - - cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, - K, P, C, - 1.0f, - &U[offset_u], K, - &V[offset_v], P, - 0.0f, - &M[offset_m], P); + // Load network from file + size_t channels, residual_blocks; + std::tie(channels, residual_blocks) = load_network_file(weightsfile); + if (channels == 0) { + exit(EXIT_FAILURE); } -} -void Network::winograd_transform_out(const std::vector& M, - std::vector& Y, - const int K) { - constexpr auto W = BOARD_SIZE; - constexpr auto H = BOARD_SIZE; - constexpr auto WTILES = (W + 1) / 2; - constexpr auto P = WTILES * WTILES; - - for (auto k = 0; k < K; k++) { - const auto kHW = k * W * H; - for (auto block_x = 0; block_x < WTILES; block_x++) { - const auto x = 2 * block_x; - for (auto block_y = 0; block_y < WTILES; block_y++) { - const auto y = 2 * block_y; - - const auto b = block_y * WTILES + block_x; - using WinogradTile = - std::array, WINOGRAD_ALPHA>; - WinogradTile temp_m; - for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) { - for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) { - temp_m[xi][nu] = - M[xi*(WINOGRAD_ALPHA*K*P) + nu*(K*P)+ k*P + b]; - } - } + auto weight_index = size_t{0}; + // Input convolution + // Winograd transform convolution weights + 
m_fwd_weights->m_conv_weights[weight_index] = winograd_transform_f( + m_fwd_weights->m_conv_weights[weight_index], channels, INPUT_CHANNELS); + weight_index++; - // Calculates transpose(A).temp_m.A - // A = [1.0, 0.0], - // [1.0, 1.0], - // [1.0, -1.0], - // [0.0, -1.0]] - - const std::array, 2> o = { - temp_m[0][0] + temp_m[0][1] + temp_m[0][2] + - temp_m[1][0] + temp_m[1][1] + temp_m[1][2] + - temp_m[2][0] + temp_m[2][1] + temp_m[2][2], - temp_m[0][1] - temp_m[0][2] - temp_m[0][3] + - temp_m[1][1] - temp_m[1][2] - temp_m[1][3] + - temp_m[2][1] - temp_m[2][2] - temp_m[2][3], - temp_m[1][0] + temp_m[1][1] + temp_m[1][2] - - temp_m[2][0] - temp_m[2][1] - temp_m[2][2] - - temp_m[3][0] - temp_m[3][1] - temp_m[3][2], - temp_m[1][1] - temp_m[1][2] - temp_m[1][3] - - temp_m[2][1] + temp_m[2][2] + temp_m[2][3] - - temp_m[3][1] + temp_m[3][2] + temp_m[3][3] - }; - - const auto y_ind = kHW + (y)*W + (x); - Y[y_ind] = o[0][0]; - if (x + 1 < W) { - Y[y_ind + 1] = o[0][1]; - } - if (y + 1 < H) { - Y[y_ind + W] = o[1][0]; - if (x + 1 < W) { - Y[y_ind + W + 1] = o[1][1]; - } - } - } + // Residual block convolutions + for (auto i = size_t{0}; i < residual_blocks * 2; i++) { + m_fwd_weights->m_conv_weights[weight_index] = winograd_transform_f( + m_fwd_weights->m_conv_weights[weight_index], channels, channels); + weight_index++; + } + + // Biases are not calculated and are typically zero but some networks might + // still have non-zero biases. + // Move biases to batchnorm means to make the output match without having + // to separately add the biases. 
+ auto bias_size = m_fwd_weights->m_conv_biases.size(); + for (auto i = size_t{0}; i < bias_size; i++) { + auto means_size = m_fwd_weights->m_batchnorm_means[i].size(); + for (auto j = size_t{0}; j < means_size; j++) { + m_fwd_weights->m_batchnorm_means[i][j] -= + m_fwd_weights->m_conv_biases[i][j]; + m_fwd_weights->m_conv_biases[i][j] = 0.0f; } } -} -void Network::winograd_convolve3(const int outputs, - const std::vector& input, - const std::vector& U, - std::vector& V, - std::vector& M, - std::vector& output) { + for (auto i = size_t{0}; i < m_bn_val_w1.size(); i++) { + m_bn_val_w1[i] -= m_fwd_weights->m_conv_val_b[i]; + m_fwd_weights->m_conv_val_b[i] = 0.0f; + } - constexpr unsigned int filter_len = WINOGRAD_ALPHA * WINOGRAD_ALPHA; - const auto input_channels = U.size() / (outputs * filter_len); + for (auto i = size_t{0}; i < m_bn_pol_w1.size(); i++) { + m_bn_pol_w1[i] -= m_fwd_weights->m_conv_pol_b[i]; + m_fwd_weights->m_conv_pol_b[i] = 0.0f; + } - winograd_transform_in(input, V, input_channels); - winograd_sgemm(U, V, M, input_channels, outputs); - winograd_transform_out(M, output, outputs); -} +#ifdef USE_OPENCL + if (cfg_cpu_only) { + myprintf("Initializing CPU-only evaluation.\n"); + m_forward = init_net(channels, std::make_unique()); + } else { +#ifdef USE_OPENCL_SELFCHECK + // initialize CPU reference first, so that we can self-check + // when doing fp16 vs. fp32 detections + m_forward_cpu = init_net(channels, std::make_unique()); +#endif +#ifdef USE_HALF + // HALF support is enabled, and we are using the GPU. + // Select the precision to use at runtime. 
+ select_precision(channels); +#else + myprintf("Initializing OpenCL (single precision).\n"); + m_forward = + init_net(channels, std::make_unique>()); +#endif + } -template -void convolve(const size_t outputs, - const std::vector& input, - const std::vector& weights, - const std::vector& biases, - std::vector& output) { - // The size of the board is defined at compile time - constexpr unsigned int width = BOARD_SIZE; - constexpr unsigned int height = BOARD_SIZE; - constexpr auto board_squares = width * height; - constexpr auto filter_len = filter_size * filter_size; - const auto input_channels = weights.size() / (biases.size() * filter_len); - const auto filter_dim = filter_len * input_channels; - assert(outputs * board_squares == output.size()); - - std::vector col(filter_dim * width * height); - im2col(input_channels, input, col); - - // Weight shape (output, input, filter_size, filter_size) - // 96 18 3 3 - // C←αAB + βC - // outputs[96,19x19] = weights[96,18x3x3] x col[18x3x3,19x19] - // M Number of rows in matrices A and C. - // N Number of columns in matrices B and C. - // K Number of columns in matrix A; number of rows in matrix B. - // lda The size of the first dimention of matrix A; if you are - // passing a matrix A[m][n], the value should be m. - // cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - // ldb, beta, C, N); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - // M N K - outputs, board_squares, filter_dim, - 1.0f, &weights[0], filter_dim, - &col[0], board_squares, - 0.0f, &output[0], board_squares); +#else // !USE_OPENCL + myprintf("Initializing CPU-only evaluation.\n"); + m_forward = init_net(channels, std::make_unique()); +#endif - for (unsigned int o = 0; o < outputs; o++) { - for (unsigned int b = 0; b < board_squares; b++) { - output[(o * board_squares) + b] += biases[o]; - } - } + // Need to estimate size before clearing up the pipe. 
+ get_estimated_size(); + m_fwd_weights.reset(); } -template +template std::vector innerproduct(const std::vector& input, const std::array& weights, const std::array& biases) { std::vector output(outputs); +#ifdef USE_BLAS cblas_sgemv(CblasRowMajor, CblasNoTrans, // M K outputs, inputs, 1.0f, &weights[0], inputs, &input[0], 1, 0.0f, &output[0], 1); - - const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ? - val : 0.0f; }; +#else + EigenVectorMap y(output.data(), outputs); + y.noalias() = + ConstEigenMatrixMap(weights.data(), inputs, outputs).transpose() + * ConstEigenVectorMap(input.data(), inputs); +#endif for (unsigned int o = 0; o < outputs; o++) { auto val = biases[o] + output[o]; if (ReLU) { - val = lambda_ReLU(val); + val = std::max(0.0f, val); } output[o] = val; } @@ -731,120 +641,51 @@ void batchnorm(const size_t channels, std::vector& data, const float* const means, const float* const stddivs, - const float* const eltwise = nullptr) -{ - const auto lambda_ReLU = [](const auto val) { return (val > 0.0f) ? 
- val : 0.0f; }; + const float* const eltwise = nullptr) { for (auto c = size_t{0}; c < channels; ++c) { const auto mean = means[c]; const auto scale_stddiv = stddivs[c]; + const auto arr = &data[c * spatial_size]; if (eltwise == nullptr) { // Classical BN - const auto arr = &data[c * spatial_size]; for (auto b = size_t{0}; b < spatial_size; b++) { - arr[b] = lambda_ReLU(scale_stddiv * (arr[b] - mean)); + arr[b] = std::max(0.0f, scale_stddiv * (arr[b] - mean)); } } else { // BN + residual add - const auto arr = &data[c * spatial_size]; const auto res = &eltwise[c * spatial_size]; for (auto b = size_t{0}; b < spatial_size; b++) { - arr[b] = lambda_ReLU((scale_stddiv * (arr[b] - mean)) + res[b]); + arr[b] = + std::max(0.0f, (scale_stddiv * (arr[b] - mean)) + res[b]); } } } } -void Network::forward_cpu(const std::vector& input, - std::vector& output_pol, - std::vector& output_val) { - // Input convolution - constexpr auto width = BOARD_SIZE; - constexpr auto height = BOARD_SIZE; - constexpr auto tiles = (width + 1) * (height + 1) / 4; - // Calculate output channels - const auto output_channels = conv_biases[0].size(); - // input_channels is the maximum number of input channels of any - // convolution. 
Residual blocks are identical, but the first convolution - // might be bigger when the network has very few filters - const auto input_channels = std::max(static_cast(output_channels), - static_cast(INPUT_CHANNELS)); - auto conv_out = std::vector(output_channels * width * height); - - auto V = std::vector(WINOGRAD_TILE * input_channels * tiles); - auto M = std::vector(WINOGRAD_TILE * output_channels * tiles); - - winograd_convolve3(output_channels, input, conv_weights[0], V, M, conv_out); - batchnorm(output_channels, conv_out, - batchnorm_means[0].data(), - batchnorm_stddivs[0].data()); - - // Residual tower - auto conv_in = std::vector(output_channels * width * height); - auto res = std::vector(output_channels * width * height); - for (auto i = size_t{1}; i < conv_weights.size(); i += 2) { - auto output_channels = conv_biases[i].size(); - std::swap(conv_out, conv_in); - winograd_convolve3(output_channels, conv_in, - conv_weights[i], V, M, conv_out); - batchnorm(output_channels, conv_out, - batchnorm_means[i].data(), - batchnorm_stddivs[i].data()); - - output_channels = conv_biases[i + 1].size(); - std::swap(conv_in, res); - std::swap(conv_out, conv_in); - winograd_convolve3(output_channels, conv_in, - conv_weights[i + 1], V, M, conv_out); - batchnorm(output_channels, conv_out, - batchnorm_means[i + 1].data(), - batchnorm_stddivs[i + 1].data(), - res.data()); - } - convolve<1>(OUTPUTS_POLICY, conv_out, conv_pol_w, conv_pol_b, output_pol); - convolve<1>(OUTPUTS_VALUE, conv_out, conv_val_w, conv_val_b, output_val); -} - -template -T relative_difference(const T a, const T b) { - // Handle NaN - if (std::isnan(a) || std::isnan(b)) { - return std::numeric_limits::max(); - } +#ifdef USE_OPENCL_SELFCHECK +void Network::compare_net_outputs(const Netresult& data, const Netresult& ref) { + // Calculates L2-norm between data and ref. 
+ constexpr auto max_error = 0.2f; - constexpr auto small_number = 1e-3f; - auto fa = std::fabs(a); - auto fb = std::fabs(b); + auto error = 0.0f; - if (fa > small_number && fb > small_number) { - // Handle sign difference - if ((a < 0) != (b < 0)) { - return std::numeric_limits::max(); - } - } else { - // Handle underflow - fa = std::max(fa, small_number); - fb = std::max(fb, small_number); + for (auto idx = size_t{0}; idx < data.policy.size(); ++idx) { + const auto diff = data.policy[idx] - ref.policy[idx]; + error += diff * diff; } - - return fabs(fa - fb) / std::min(fa, fb); -} - -void compare_net_outputs(std::vector& data, - std::vector& ref) { - // We accept an error up to 5%, but output values - // smaller than 1/1000th are "rounded up" for the comparison. - constexpr auto relative_error = 5e-2f; - for (auto idx = size_t{0}; idx < data.size(); ++idx) { - const auto err = relative_difference(data[idx], ref[idx]); - if (err > relative_error) { - printf("Error in OpenCL calculation: expected %f got %f " - "(error=%f%%)\n", ref[idx], data[idx], err * 100.0); - printf("Update your GPU drivers or reduce the amount of games " - "played simultaneously.\n"); - throw std::runtime_error("OpenCL self-check mismatch."); - } + const auto diff_pass = data.policy_pass - ref.policy_pass; + const auto diff_winrate = data.winrate - ref.winrate; + error += diff_pass * diff_pass; + error += diff_winrate * diff_winrate; + + error = std::sqrt(error); + + if (error > max_error || std::isnan(error)) { + printf( + "Error in OpenCL calculation: Update your device's OpenCL drivers " + "or reduce the amount of games played simultaneously.\n"); + throw std::runtime_error("OpenCL self-check mismatch."); } } #endif @@ -870,137 +711,172 @@ std::vector softmax(const std::vector& input, return output; } -Network::Netresult Network::get_scored_moves( - const GameState* const state, const Ensemble ensemble, - const int symmetry, const bool skip_cache) { +bool Network::probe_cache(const 
GameState* const state, + Network::Netresult& result) { + if (m_nncache.lookup(state->board.get_hash(), result)) { + return true; + } + // If we are not generating a self-play game, try to find + // symmetries if we are in the early opening. + if (!cfg_noise && !cfg_random_cnt + && state->get_movenum() + < (state->get_timecontrol().opening_moves(BOARD_SIZE) / 2)) { + for (auto sym = 0; sym < Network::NUM_SYMMETRIES; ++sym) { + if (sym == Network::IDENTITY_SYMMETRY) { + continue; + } + const auto hash = state->get_symmetry_hash(sym); + if (m_nncache.lookup(hash, result)) { + decltype(result.policy) corrected_policy; + for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; ++idx) { + const auto sym_idx = symmetry_nn_idx_table[sym][idx]; + corrected_policy[idx] = result.policy[sym_idx]; + } + result.policy = std::move(corrected_policy); + return true; + } + } + } + + return false; +} + +Network::Netresult Network::get_output( + const GameState* const state, const Ensemble ensemble, const int symmetry, + const bool read_cache, const bool write_cache, const bool force_selfcheck) { Netresult result; if (state->board.get_boardsize() != BOARD_SIZE) { return result; } - if (!skip_cache) { + if (read_cache) { // See if we already have this in the cache. 
- if (NNCache::get_NNCache().lookup(state->board.get_hash(), result)) { + if (probe_cache(state, result)) { return result; } } if (ensemble == DIRECT) { - assert(symmetry >= 0 && symmetry <= 7); - result = get_scored_moves_internal(state, symmetry); + assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES); + result = get_output_internal(state, symmetry); } else if (ensemble == AVERAGE) { - for (auto sym = 0; sym < 8; ++sym) { - auto tmpresult = get_scored_moves_internal(state, sym); - result.winrate += tmpresult.winrate / 8.0f; - result.policy_pass += tmpresult.policy_pass / 8.0f; - - for (auto idx = size_t{0}; idx < BOARD_SQUARES; idx++) { - result.policy[idx] += tmpresult.policy[idx] / 8.0f; + assert(symmetry == -1); + for (auto sym = 0; sym < NUM_SYMMETRIES; ++sym) { + auto tmpresult = get_output_internal(state, sym); + result.winrate += + tmpresult.winrate / static_cast(NUM_SYMMETRIES); + result.policy_pass += + tmpresult.policy_pass / static_cast(NUM_SYMMETRIES); + + for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; idx++) { + result.policy[idx] += + tmpresult.policy[idx] / static_cast(NUM_SYMMETRIES); } } } else { assert(ensemble == RANDOM_SYMMETRY); assert(symmetry == -1); - const auto rand_sym = Random::get_Rng().randfix<8>(); - result = get_scored_moves_internal(state, rand_sym); + const auto rand_sym = Random::get_Rng().randfix(); + result = get_output_internal(state, rand_sym); +#ifdef USE_OPENCL_SELFCHECK + // Both implementations are available, self-check the OpenCL driver by + // running both with a probability of 1/2000. + // selfcheck is done here because this is the only place NN + // evaluation is done on actual gameplay. 
+ if (m_forward_cpu != nullptr + && (force_selfcheck + || Random::get_Rng().randfix() == 0)) { + auto result_ref = get_output_internal(state, rand_sym, true); + compare_net_outputs(result, result_ref); + } +#else + (void)force_selfcheck; +#endif } // v2 format (ELF Open Go) returns black value, not stm - if (value_head_not_stm) { + if (m_value_head_not_stm) { if (state->board.get_to_move() == FastBoard::WHITE) { result.winrate = 1.0f - result.winrate; } } - // Insert result into cache. - NNCache::get_NNCache().insert(state->board.get_hash(), result); + if (write_cache) { + // Insert result into cache. + m_nncache.insert(state->board.get_hash(), result); + } return result; } -Network::Netresult Network::get_scored_moves_internal( - const GameState* const state, const int symmetry) { - assert(symmetry >= 0 && symmetry <= 7); +Network::Netresult Network::get_output_internal(const GameState* const state, + const int symmetry, + bool selfcheck) { + assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES); constexpr auto width = BOARD_SIZE; constexpr auto height = BOARD_SIZE; const auto input_data = gather_features(state, symmetry); std::vector policy_data(OUTPUTS_POLICY * width * height); std::vector value_data(OUTPUTS_VALUE * width * height); -#ifdef USE_HALF - std::vector policy_data_n(OUTPUTS_POLICY * width * height); - std::vector value_data_n(OUTPUTS_VALUE * width * height); -#endif -#ifdef USE_OPENCL -#ifdef USE_HALF - opencl.forward(input_data, policy_data_n, value_data_n); - std::copy(begin(policy_data_n), end(policy_data_n), begin(policy_data)); - std::copy(begin(value_data_n), end(value_data_n), begin(value_data)); -#else - opencl.forward(input_data, policy_data, value_data); -#endif -#elif defined(USE_BLAS) && !defined(USE_OPENCL) - forward_cpu(input_data, policy_data, value_data); -#endif #ifdef USE_OPENCL_SELFCHECK - // Both implementations are available, self-check the OpenCL driver by - // running both with a probability of 1/2000. 
- if (Random::get_Rng().randfix() == 0) { - auto cpu_policy_data = std::vector(policy_data.size()); - auto cpu_value_data = std::vector(value_data.size()); - forward_cpu(input_data, cpu_policy_data, cpu_value_data); - compare_net_outputs(policy_data, cpu_policy_data); - compare_net_outputs(value_data, cpu_value_data); + if (selfcheck) { + m_forward_cpu->forward(input_data, policy_data, value_data); + } else { + m_forward->forward(input_data, policy_data, value_data); } +#else + m_forward->forward(input_data, policy_data, value_data); + (void)selfcheck; #endif // Get the moves - batchnorm(OUTPUTS_POLICY, policy_data, - bn_pol_w1.data(), bn_pol_w2.data()); + batchnorm(OUTPUTS_POLICY, policy_data, + m_bn_pol_w1.data(), m_bn_pol_w2.data()); const auto policy_out = - innerproduct( - policy_data, ip_pol_w, ip_pol_b); + innerproduct(policy_data, m_ip_pol_w, m_ip_pol_b); const auto outputs = softmax(policy_out, cfg_softmax_temp); - // Now get the score - batchnorm(OUTPUTS_VALUE, value_data, - bn_val_w1.data(), bn_val_w2.data()); + // Now get the value + batchnorm(OUTPUTS_VALUE, value_data, m_bn_val_w1.data(), + m_bn_val_w2.data()); const auto winrate_data = - innerproduct(value_data, ip1_val_w, ip1_val_b); - const auto winrate_out = - innerproduct<256, 1, false>(winrate_data, ip2_val_w, ip2_val_b); + innerproduct( + value_data, m_ip1_val_w, m_ip1_val_b); + const auto winrate_out = innerproduct( + winrate_data, m_ip2_val_w, m_ip2_val_b); - // Sigmoid - const auto winrate_sig = (1.0f + std::tanh(winrate_out[0])) / 2.0f; + // Map TanH output range [-1..1] to [0..1] range + const auto winrate = (1.0f + std::tanh(winrate_out[0])) / 2.0f; Netresult result; - for (auto idx = size_t{0}; idx < BOARD_SQUARES; idx++) { + for (auto idx = size_t{0}; idx < NUM_INTERSECTIONS; idx++) { const auto sym_idx = symmetry_nn_idx_table[symmetry][idx]; result.policy[sym_idx] = outputs[idx]; } - result.policy_pass = outputs[BOARD_SQUARES]; - result.winrate = winrate_sig; + result.policy_pass = 
outputs[NUM_INTERSECTIONS]; + result.winrate = winrate; return result; } void Network::show_heatmap(const FastState* const state, - const Netresult& result, - const bool topmoves) { + const Netresult& result, const bool topmoves) { std::vector display_map; std::string line; for (unsigned int y = 0; y < BOARD_SIZE; y++) { for (unsigned int x = 0; x < BOARD_SIZE; x++) { - auto score = 0; + auto policy = 0; const auto vertex = state->board.get_vertex(x, y); - if (state->board.get_square(vertex) == FastBoard::EMPTY) { - score = result.policy[y * BOARD_SIZE + x] * 1000; + if (state->board.get_state(vertex) == FastBoard::EMPTY) { + policy = result.policy[y * BOARD_SIZE + x] * 1000; } - line += boost::str(boost::format("%3d ") % score); + line += boost::str(boost::format("%3d ") % policy); } display_map.push_back(line); @@ -1010,17 +886,17 @@ void Network::show_heatmap(const FastState* const state, for (int i = display_map.size() - 1; i >= 0; --i) { myprintf("%s\n", display_map[i].c_str()); } - const auto pass_score = int(result.policy_pass * 1000); - myprintf("pass: %d\n", pass_score); + const auto pass_policy = int(result.policy_pass * 1000); + myprintf("pass: %d\n", pass_policy); myprintf("winrate: %f\n", result.winrate); if (topmoves) { - std::vector moves; - for (auto i=0; i < BOARD_SQUARES; i++) { + std::vector moves; + for (auto i = 0; i < NUM_INTERSECTIONS; i++) { const auto x = i % BOARD_SIZE; const auto y = i / BOARD_SIZE; const auto vertex = state->board.get_vertex(x, y); - if (state->board.get_square(vertex) == FastBoard::EMPTY) { + if (state->board.get_state(vertex) == FastBoard::EMPTY) { moves.emplace_back(result.policy[i], vertex); } } @@ -1029,97 +905,154 @@ void Network::show_heatmap(const FastState* const state, std::stable_sort(rbegin(moves), rend(moves)); auto cum = 0.0f; - size_t tried = 0; - while (cum < 0.85f && tried < moves.size()) { - if (moves[tried].first < 0.01f) break; + for (const auto& move : moves) { + if (cum > 0.85f || move.first < 
0.01f) break; myprintf("%1.3f (%s)\n", - moves[tried].first, - state->board.move_to_text(moves[tried].second).c_str()); - cum += moves[tried].first; - tried++; + move.first, + state->board.move_to_text(move.second).c_str()); + cum += move.first; } } } void Network::fill_input_plane_pair(const FullBoard& board, - std::vector::iterator black, - std::vector::iterator white, + std::vector::iterator black, + std::vector::iterator white, const int symmetry) { - for (auto idx = 0; idx < BOARD_SQUARES; idx++) { + for (auto idx = 0; idx < NUM_INTERSECTIONS; idx++) { const auto sym_idx = symmetry_nn_idx_table[symmetry][idx]; const auto x = sym_idx % BOARD_SIZE; const auto y = sym_idx / BOARD_SIZE; - const auto color = board.get_square(x, y); + const auto color = board.get_state(x, y); if (color == FastBoard::BLACK) { - black[idx] = net_t(true); + black[idx] = float(true); } else if (color == FastBoard::WHITE) { - white[idx] = net_t(true); + white[idx] = float(true); } } } -std::vector Network::gather_features(const GameState* const state, +std::vector Network::gather_features(const GameState* const state, const int symmetry) { - assert(symmetry >= 0 && symmetry <= 7); - auto input_data = std::vector(INPUT_CHANNELS * BOARD_SQUARES); + assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES); + auto input_data = std::vector(INPUT_CHANNELS * NUM_INTERSECTIONS); const auto to_move = state->get_to_move(); const auto blacks_move = to_move == FastBoard::BLACK; - const auto black_it = blacks_move ? - begin(input_data) : - begin(input_data) + INPUT_MOVES * BOARD_SQUARES; - const auto white_it = blacks_move ? - begin(input_data) + INPUT_MOVES * BOARD_SQUARES : - begin(input_data); - const auto to_move_it = blacks_move ? - begin(input_data) + 2 * INPUT_MOVES * BOARD_SQUARES : - begin(input_data) + (2 * INPUT_MOVES + 1) * BOARD_SQUARES; + const auto black_it = + blacks_move ? begin(input_data) + : begin(input_data) + INPUT_MOVES * NUM_INTERSECTIONS; + const auto white_it = + blacks_move ? 
begin(input_data) + INPUT_MOVES * NUM_INTERSECTIONS + : begin(input_data); + const auto to_move_it = + blacks_move + ? begin(input_data) + 2 * INPUT_MOVES * NUM_INTERSECTIONS + : begin(input_data) + (2 * INPUT_MOVES + 1) * NUM_INTERSECTIONS; const auto moves = std::min(state->get_movenum() + 1, INPUT_MOVES); // Go back in time, fill history boards for (auto h = size_t{0}; h < moves; h++) { // collect white, black occupation planes fill_input_plane_pair(state->get_past_board(h), - black_it + h * BOARD_SQUARES, - white_it + h * BOARD_SQUARES, - symmetry); + black_it + h * NUM_INTERSECTIONS, + white_it + h * NUM_INTERSECTIONS, symmetry); } - std::fill(to_move_it, to_move_it + BOARD_SQUARES, net_t(true)); + std::fill(to_move_it, to_move_it + NUM_INTERSECTIONS, float(true)); return input_data; } -int Network::get_nn_idx_symmetry(const int vertex, int symmetry) { - assert(vertex >= 0 && vertex < BOARD_SQUARES); - assert(symmetry >= 0 && symmetry < 8); - auto x = vertex % BOARD_SIZE; - auto y = vertex / BOARD_SIZE; - int newx; - int newy; +std::pair Network::get_symmetry(const std::pair& vertex, + const int symmetry, + const int board_size) { + auto x = vertex.first; + auto y = vertex.second; + assert(x >= 0 && x < board_size); + assert(y >= 0 && y < board_size); + assert(symmetry >= 0 && symmetry < NUM_SYMMETRIES); - if (symmetry >= 4) { + if ((symmetry & 4) != 0) { std::swap(x, y); - symmetry -= 4; } - if (symmetry == 0) { - newx = x; - newy = y; - } else if (symmetry == 1) { - newx = x; - newy = BOARD_SIZE - y - 1; - } else if (symmetry == 2) { - newx = BOARD_SIZE - x - 1; - newy = y; - } else { - assert(symmetry == 3); - newx = BOARD_SIZE - x - 1; - newy = BOARD_SIZE - y - 1; + if ((symmetry & 2) != 0) { + x = board_size - x - 1; + } + + if ((symmetry & 1) != 0) { + y = board_size - y - 1; + } + + assert(x >= 0 && x < board_size); + assert(y >= 0 && y < board_size); + assert(symmetry != IDENTITY_SYMMETRY || vertex == std::make_pair(x, y)); + return {x, y}; +} + 
+size_t Network::get_estimated_size() { + if (estimated_size != 0) { + return estimated_size; } + auto result = size_t{0}; + + const auto lambda_vector_size = + [](const std::vector>& v) { + auto result = size_t{0}; + for (auto it = begin(v); it != end(v); ++it) { + result += it->size() * sizeof(float); + } + return result; + }; + + result += lambda_vector_size(m_fwd_weights->m_conv_weights); + result += lambda_vector_size(m_fwd_weights->m_conv_biases); + result += lambda_vector_size(m_fwd_weights->m_batchnorm_means); + result += lambda_vector_size(m_fwd_weights->m_batchnorm_stddevs); + + result += m_fwd_weights->m_conv_pol_w.size() * sizeof(float); + result += m_fwd_weights->m_conv_pol_b.size() * sizeof(float); + + // Policy head + result += OUTPUTS_POLICY * sizeof(float); // m_bn_pol_w1 + result += OUTPUTS_POLICY * sizeof(float); // m_bn_pol_w2 + result += OUTPUTS_POLICY * NUM_INTERSECTIONS * POTENTIAL_MOVES + * sizeof(float); // m_ip_pol_w + result += POTENTIAL_MOVES * sizeof(float); // m_ip_pol_b + + // Value head + result += m_fwd_weights->m_conv_val_w.size() * sizeof(float); + result += m_fwd_weights->m_conv_val_b.size() * sizeof(float); + result += OUTPUTS_VALUE * sizeof(float); // m_bn_val_w1 + result += OUTPUTS_VALUE * sizeof(float); // m_bn_val_w2 + + result += OUTPUTS_VALUE * NUM_INTERSECTIONS * VALUE_LAYER + * sizeof(float); // m_ip1_val_w + result += VALUE_LAYER * sizeof(float); // m_ip1_val_b + + result += VALUE_LAYER * sizeof(float); // m_ip2_val_w + result += sizeof(float); // m_ip2_val_b + return estimated_size = result; +} + +size_t Network::get_estimated_cache_size() { + return m_nncache.get_estimated_size(); +} + +void Network::nncache_resize(const int max_count) { + return m_nncache.resize(max_count); +} + +void Network::nncache_clear() { + m_nncache.clear(); +} + +void Network::drain_evals() { + m_forward->drain(); +} - const auto newvtx = (newy * BOARD_SIZE) + newx; - assert(newvtx >= 0 && newvtx < BOARD_SQUARES); - return newvtx; +void 
Network::resume_evals() { + m_forward->resume(); } diff --git a/src/Network.h b/src/Network.h index 12948114d..462d2c9cc 100644 --- a/src/Network.h +++ b/src/Network.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #ifndef NETWORK_H_INCLUDED @@ -22,75 +33,103 @@ #include "config.h" #include +#include +#include #include #include #include #include -#include #include "FastState.h" +#include "NNCache.h" +#ifdef USE_OPENCL +#include "OpenCLScheduler.h" +#endif +#include "ForwardPipe.h" #include "GameState.h" +#ifdef USE_OPENCL +#include "OpenCLScheduler.h" +#endif +#ifdef USE_OPENCL_SELFCHECK +#include "SMP.h" +#endif + +// Winograd filter transformation changes 3x3 filters to M + 3 - 1 +constexpr auto WINOGRAD_M = 4; +constexpr auto WINOGRAD_ALPHA = WINOGRAD_M + 3 - 1; +constexpr auto WINOGRAD_WTILES = + BOARD_SIZE / WINOGRAD_M + (BOARD_SIZE % WINOGRAD_M != 0); +constexpr auto WINOGRAD_TILE = WINOGRAD_ALPHA * WINOGRAD_ALPHA; +constexpr auto WINOGRAD_P = WINOGRAD_WTILES * WINOGRAD_WTILES; +constexpr auto SQ2 = 1.4142135623730951f; // Square root of 2 + +// See drain_evals() / resume_evals() for details. +class NetworkHaltException : public std::exception {}; class Network { + using ForwardPipeWeights = ForwardPipe::ForwardPipeWeights; + public: + static constexpr auto NUM_SYMMETRIES = 8; + static constexpr auto IDENTITY_SYMMETRY = 0; enum Ensemble { DIRECT, RANDOM_SYMMETRY, AVERAGE }; - using ScoreVertexPair = std::pair; + using PolicyVertexPair = std::pair; + using Netresult = NNCache::Netresult; - struct Netresult { - // 19x19 board positions - std::vector policy; + virtual ~Network() = default; - // pass - float policy_pass; - - // winrate - float winrate; - - Netresult() : policy(BOARD_SQUARES), policy_pass(0.0f), winrate(0.0f) {} - }; - - static Netresult get_scored_moves(const GameState* const state, - const Ensemble ensemble, - const int symmetry = -1, - const bool skip_cache = false); + Netresult get_output(const GameState* state, Ensemble ensemble, + int symmetry = -1, bool read_cache = true, + bool write_cache = true, bool force_selfcheck = false); static constexpr auto INPUT_MOVES = 8; static constexpr auto INPUT_CHANNELS = 2 * INPUT_MOVES + 2; static constexpr auto 
OUTPUTS_POLICY = 2; static constexpr auto OUTPUTS_VALUE = 1; + static constexpr auto VALUE_LAYER = 256; + + void initialize(int playouts, const std::string& weightsfile); - // Winograd filter transformation changes 3x3 filters to 4x4 - static constexpr auto WINOGRAD_ALPHA = 4; - static constexpr auto WINOGRAD_TILE = WINOGRAD_ALPHA * WINOGRAD_ALPHA; + float benchmark_time(int centiseconds); + void benchmark(const GameState* state, int iterations = 1600); + static void show_heatmap(const FastState* state, const Netresult& netres, + bool topmoves); - static void initialize(); - static void benchmark(const GameState * const state, - const int iterations = 1600); - static void show_heatmap(const FastState * const state, - const Netresult & netres, const bool topmoves); + static std::vector gather_features(const GameState* state, + int symmetry); + static std::pair get_symmetry(const std::pair& vertex, + int symmetry, + int board_size = BOARD_SIZE); + + size_t get_estimated_size(); + size_t get_estimated_cache_size(); + void nncache_resize(int max_count); + void nncache_clear(); + + // 'Drain' evaluations. Threads with an evaluation will throw a + // NetworkHaltException if possible, or will just proceed and drain ASAP. + // New evaluation requests will also result in a NetworkHaltException. + virtual void drain_evals(); + + // Flag the network to be open for business. 
+ virtual void resume_evals(); - static std::vector gather_features(const GameState* const state, - const int symmetry); private: - static std::pair load_v1_network(std::istream& wtfile); - static std::pair load_network_file(const std::string& filename); - static void process_bn_var(std::vector& weights, - const float epsilon = 1e-5f); + std::pair load_v1_network(std::istream& wtfile); + std::pair load_network_file(const std::string& filename); static std::vector winograd_transform_f(const std::vector& f, - const int outputs, const int channels); + int outputs, int channels); static std::vector zeropad_U(const std::vector& U, - const int outputs, const int channels, - const int outputs_pad, const int channels_pad); + int outputs, int channels, + int outputs_pad, int channels_pad); static void winograd_transform_in(const std::vector& in, - std::vector& V, - const int C); + std::vector& V, int C); static void winograd_transform_out(const std::vector& M, - std::vector& Y, - const int K); - static void winograd_convolve3(const int outputs, + std::vector& Y, int K); + static void winograd_convolve3(int outputs, const std::vector& input, const std::vector& U, std::vector& V, @@ -98,20 +137,50 @@ class Network { std::vector& output); static void winograd_sgemm(const std::vector& U, const std::vector& V, - std::vector& M, const int C, const int K); - static int get_nn_idx_symmetry(const int vertex, int symmetry); + std::vector& M, int C, int K); + Netresult get_output_internal(const GameState* state, int symmetry, + bool selfcheck = false); static void fill_input_plane_pair(const FullBoard& board, - std::vector::iterator black, - std::vector::iterator white, - const int symmetry); - static Netresult get_scored_moves_internal(const GameState* const state, - const int symmetry); -#if defined(USE_BLAS) - static void forward_cpu(const std::vector& input, - std::vector& output_pol, - std::vector& output_val); - + std::vector::iterator black, + std::vector::iterator white, + int 
symmetry); + bool probe_cache(const GameState* state, Network::Netresult& result); + std::unique_ptr&& init_net( + int channels, std::unique_ptr&& pipe); +#ifdef USE_HALF + void select_precision(int channels); #endif -}; + std::unique_ptr m_forward; +#ifdef USE_OPENCL_SELFCHECK + void compare_net_outputs(const Netresult& data, const Netresult& ref); + std::unique_ptr m_forward_cpu; +#endif + + NNCache m_nncache; + size_t estimated_size{0}; + + // Residual tower + std::shared_ptr m_fwd_weights; + + // Policy head + std::array m_bn_pol_w1; + std::array m_bn_pol_w2; + + std::array + m_ip_pol_w; + std::array m_ip_pol_b; + + // Value head + std::array m_bn_val_w1; + std::array m_bn_val_w2; + + std::array + m_ip1_val_w; + std::array m_ip1_val_b; + + std::array m_ip2_val_w; + std::array m_ip2_val_b; + bool m_value_head_not_stm; +}; #endif diff --git a/src/OpenCL.cpp b/src/OpenCL.cpp index 7b2f09216..db232845f 100644 --- a/src/OpenCL.cpp +++ b/src/OpenCL.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,464 +14,150 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" #ifdef USE_OPENCL -#include "OpenCL.h" - -#include #include #include #include -#include -#include -#include - +#include #include #include +#include +#include #include #include +#include #include -#include "Network.h" #include "GTP.h" -#include "Utils.h" +#include "Network.h" +#include "OpenCL.h" #include "Tuner.h" +#include "Utils.h" using namespace Utils; -static std::string cl_args = -#ifdef USE_HALF - "-DUSE_HALF " -#endif - "-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-denorms-are-zero"; - -static std::string sourceCode_config = R"( -#ifdef USE_HALF - typedef half net_t; - #define vload_net_t(offset,p) vload_half(offset,p) - #define vstore_net_t(data,offset,p) vstore_half(data,offset,p) -#else - typedef float net_t; - #define vload_net_t(offset,p) ((p)[(offset)]) - #define vstore_net_t(data,offset,p) (((p)[(offset)])=(data)) -#endif - #define BOARD_SIZE )" + std::to_string(BOARD_SIZE) + - "\n #define BOARD_SQUARES " + std::to_string(BOARD_SQUARES); - -static std::string sourceCode_convolve1 = R"( - __kernel - __attribute__((work_group_size_hint(8, 16, 1))) - void convolve1( - __global const net_t * restrict in, - __global net_t * restrict merge, - __global const net_t * restrict weights, - __local float * channel_buff, - __local float * row_buff) { - // cl::NDRange global(channels, outputs, row); - const int c = get_global_id(0); // channel - const int o = get_global_id(1); // output - const int row = get_global_id(2); // row - const int channels = get_global_size(0); - const int outputs = get_global_size(1); - // cl::NDRange local(2, (1->32), 1); - const int lx = get_local_id(0); - const int ly = get_local_id(1); - const int chan_buff_size = 8; - const int out_buff_size = get_local_size(1); - const int row_buff_size = 7; - const int chan_shift = 3; - // input = channels * height * width - // output = outputs * height * width - // weights = output * channels * filter - // merge = channels * outputs * height * width - const 
int width = BOARD_SIZE; - const int height = BOARD_SIZE; - const int strip_size = width; - // Copy the input channels (strips) locally - if (out_buff_size < BOARD_SIZE && ly == 0) { - // strip-row - for (int w = 0; w < width; w++) { - channel_buff[lx * width + w] = - vload_net_t((c * height + row) * width + w, in); - } - } else if (out_buff_size >= BOARD_SIZE && ly < BOARD_SIZE) { - // Every thread copies a column - channel_buff[lx * width + ly] = vload_net_t((c * height + row) * width + ly, in); - } - // Copy the filter we are applying locally - __private float filter_buff = vload_net_t((o * channels + c), weights); - barrier(CLK_LOCAL_MEM_FENCE); - int out_lane = 0; - int out_cw = 0; - #pragma unroll - for (int cw = 0; cw < width; cw++) { - int fid = lx * strip_size; - float out = channel_buff[fid + cw] * filter_buff; - row_buff[(ly * chan_buff_size + lx) * row_buff_size + out_lane] = out; - out_lane++; - // Row buffer full or last lane? - if (out_lane == row_buff_size || (cw == width - 1)) { - barrier(CLK_LOCAL_MEM_FENCE); - if (lx < out_lane) { - float val; - val = row_buff[(ly * chan_buff_size + 0) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 1) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 2) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 3) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 4) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 5) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 6) * row_buff_size + lx]; - val += row_buff[(ly * chan_buff_size + 7) * row_buff_size + lx]; - vstore_net_t(val, (((c >> chan_shift) * height + row) * width + out_cw + lx) * outputs + o, merge); - } - out_cw += row_buff_size; - out_lane = 0; - } - } - } - -__kernel void merge( - __global const net_t * restrict in, - __global net_t * restrict out, - __private const int channels) { - // cl::NDRange global(outputs, BOARD_SQUARES); - const int gx = get_global_id(0); - const int 
gy = get_global_id(1); - const int output = gx; - const int b = gy; - const int outputs = get_global_size(0); - const int width = BOARD_SIZE; - const int height = BOARD_SIZE; - const int o = output; - float sum = 0; - for (int c = 0; c < channels; c++) { - sum += vload_net_t((c * BOARD_SQUARES + b) * outputs + o, in); - } - vstore_net_t(sum, o * BOARD_SQUARES + b, out); - } -)"; - -static std::string sourceCode_convolve3 = R"( -void __in_transform_eq(float x[4][4], __global net_t * restrict V, int offset, int CPpad) { - float T1[4][4]; - - T1[0][0] = x[0][0] - x[2][0]; - T1[0][1] = x[0][1] - x[2][1]; - T1[0][2] = x[0][2] - x[2][2]; - T1[0][3] = x[0][3] - x[2][3]; - T1[1][0] = x[1][0] + x[2][0]; - T1[1][1] = x[1][1] + x[2][1]; - T1[1][2] = x[1][2] + x[2][2]; - T1[1][3] = x[1][3] + x[2][3]; - T1[2][0] = x[2][0] - x[1][0]; - T1[2][1] = x[2][1] - x[1][1]; - T1[2][2] = x[2][2] - x[1][2]; - T1[2][3] = x[2][3] - x[1][3]; - T1[3][0] = x[1][0] - x[3][0]; - T1[3][1] = x[1][1] - x[3][1]; - T1[3][2] = x[1][2] - x[3][2]; - T1[3][3] = x[1][3] - x[3][3]; - - vstore_net_t(T1[0][0] - T1[0][2], (0*4 + 0)*CPpad + offset, V); - vstore_net_t(T1[0][1] + T1[0][2], (0*4 + 1)*CPpad + offset, V); - vstore_net_t(T1[0][2] - T1[0][1], (0*4 + 2)*CPpad + offset, V); - vstore_net_t(T1[0][1] - T1[0][3], (0*4 + 3)*CPpad + offset, V); - vstore_net_t(T1[1][0] - T1[1][2], (1*4 + 0)*CPpad + offset, V); - vstore_net_t(T1[1][1] + T1[1][2], (1*4 + 1)*CPpad + offset, V); - vstore_net_t(T1[1][2] - T1[1][1], (1*4 + 2)*CPpad + offset, V); - vstore_net_t(T1[1][1] - T1[1][3], (1*4 + 3)*CPpad + offset, V); - vstore_net_t(T1[2][0] - T1[2][2], (2*4 + 0)*CPpad + offset, V); - vstore_net_t(T1[2][1] + T1[2][2], (2*4 + 1)*CPpad + offset, V); - vstore_net_t(T1[2][2] - T1[2][1], (2*4 + 2)*CPpad + offset, V); - vstore_net_t(T1[2][1] - T1[2][3], (2*4 + 3)*CPpad + offset, V); - vstore_net_t(T1[3][0] - T1[3][2], (3*4 + 0)*CPpad + offset, V); - vstore_net_t(T1[3][1] + T1[3][2], (3*4 + 1)*CPpad + offset, V); - 
vstore_net_t(T1[3][2] - T1[3][1], (3*4 + 2)*CPpad + offset, V); - vstore_net_t(T1[3][1] - T1[3][3], (3*4 + 3)*CPpad + offset, V); -} - -__kernel void in_transform(__global net_t * restrict in, __global net_t * restrict V, - const int C, const int Cpad, - const int Ppad) { - const int W = BOARD_SIZE; - const int H = BOARD_SIZE; - const int T = W*H; - const int WTILES = (W + 1) / 2; - const int P = WTILES*WTILES; - const int CPpad = Ppad * Cpad; - - const int block = get_global_id(0); - const int ch = get_global_id(1); - const int chT = ch*(T); - - const int block_x = block % WTILES; - const int block_y = block / WTILES; - - // Tiles overlap by 2 - const int yin = 2 * block_y - 1; - const int xin = 2 * block_x - 1; - - if (block < P && ch < C) { - // Cache input tile and handle zero padding - float x[4][4]; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - int a = xin + j; - int b = yin + i; - if (b >= 0 && a >= 0 && b < H && a < W) { - x[i][j] = vload_net_t(chT + b*W + a, in); - } else { - x[i][j] = 0.0f; - } - } - } +template static std::string getClArgs(); - const int offset = ch*Ppad + block; - __in_transform_eq(x, V, offset, CPpad); - } +template <> +std::string getClArgs() { + return "-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros " + "-cl-denorms-are-zero"; } - -void __out_transform_eq(__global const net_t * restrict M, float o[4], - int Kpad, int Ppad, int block_x, int block_y) -{ - const int W = BOARD_SIZE; - const int H = BOARD_SIZE; - const int WTILES = (W + 1) / 2; - const int b = block_y * WTILES + block_x; - const int KPpad = Kpad * Ppad; - const int k = get_global_id(0); - float temp_m[16]; - for (int xn = 0, xnKPpad = b*Kpad + k; xn < 16; xn++, xnKPpad += KPpad) { - temp_m[xn] = vload_net_t(xnKPpad, M); - } - - o[0] = temp_m[0*4 + 0] + temp_m[0*4 + 1] + temp_m[0*4 + 2] + - temp_m[1*4 + 0] + temp_m[1*4 + 1] + temp_m[1*4 + 2] + - temp_m[2*4 + 0] + temp_m[2*4 + 1] + temp_m[2*4 + 2]; - - o[1] = temp_m[0*4 + 1] - temp_m[0*4 + 2] - 
temp_m[0*4 + 3] + - temp_m[1*4 + 1] - temp_m[1*4 + 2] - temp_m[1*4 + 3] + - temp_m[2*4 + 1] - temp_m[2*4 + 2] - temp_m[2*4 + 3]; - - o[2] = temp_m[1*4 + 0] + temp_m[1*4 + 1] + temp_m[1*4 + 2] - - temp_m[2*4 + 0] - temp_m[2*4 + 1] - temp_m[2*4 + 2] - - temp_m[3*4 + 0] - temp_m[3*4 + 1] - temp_m[3*4 + 2]; - - o[3] = temp_m[1*4 + 1] - temp_m[1*4 + 2] - temp_m[1*4 + 3] - - temp_m[2*4 + 1] + temp_m[2*4 + 2] + temp_m[2*4 + 3] - - temp_m[3*4 + 1] + temp_m[3*4 + 2] + temp_m[3*4 + 3]; +#ifdef USE_HALF +template <> +std::string getClArgs() { + return "-DUSE_HALF " + "-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros " + "-cl-denorms-are-zero"; } +#endif -__kernel void out_transform_fused_bn(__global const net_t * restrict M, - __global net_t * restrict Y, - const int K, - const int Kpad, const int Ppad, - __global const net_t * restrict residual, - __constant const net_t * restrict means, - __constant const net_t * restrict stddivs) { - const int W = BOARD_SIZE; - const int H = BOARD_SIZE; - const int WTILES = (W + 1) / 2; - const int P = WTILES * WTILES; - - int k = get_global_id(0); - int block = get_global_id(1); - - const int block_x = block % WTILES; - const int block_y = block / WTILES; - - int x = 2*block_x; - int y = 2*block_y; - int a_ind = (y)*W + (x); - if (k < K && block < P) { - const int kHW = k * W * H; - float o[4]; - __out_transform_eq(M, o, Kpad, Ppad, block_x, block_y); - - const float mean = vload_net_t(k, means); - const float scale_stddiv = vload_net_t(k, stddivs); - - const bool pred[4] = { 1, x+1 < W, y+1 < H, x+1 < W & y+1 < H}; - - const int a[4] = {a_ind, a_ind+1, a_ind+W, a_ind+W+1}; - - for (int i = 0; i < 4; i++) { - if (pred[i]) { - o[i] = scale_stddiv * (o[i] - mean); - if (residual) { - o[i] += vload_net_t(kHW + a[i], residual); - } - o[i] = o[i] > 0 ? 
o[i] : 0.0f; - vstore_net_t(o[i], kHW + a[i], Y); - } - } - } -} +const std::string sourceCode_common = + #include "kernels/common.opencl" +; -__kernel void out_transform_fused_bn_in( - __global const net_t * restrict M, - __global net_t * restrict Y, - __global net_t * restrict V, - const int K, - const int Kpad, const int Ppad, const int Cpad, - __global const net_t * restrict residual, - __constant const net_t * restrict means, - __constant const net_t * restrict stddivs, - __local float * ybuf) { - const int W = BOARD_SIZE; - const int H = BOARD_SIZE; - const int T = W*H; - const int WTILES = (W + 1) / 2; - const int P = WTILES * WTILES; - const int KPpad = Kpad * Ppad; - - const int k = get_global_id(0); - const int kg = get_local_id(0); - const int block = get_global_id(1); - - const int block_x = block % WTILES; - const int block_y = block / WTILES; - - const int yin = 2 * block_y - 1; - const int xin = 2 * block_x - 1; - - - const int x = 2*block_x; - const int y = 2*block_y; - int a_ind = (y)*W + (x); - - - if (k < K && block < P) { - const int a[4] = {a_ind, a_ind+1, a_ind+W, a_ind+W+1}; - const bool pred[4] = { 1, x+1 < W, y+1 < H, x+1 < W & y+1 < H}; - const int kHW = k * W * H; - - float o[4]; - __out_transform_eq(M, o, Kpad, Ppad, block_x, block_y); - - const float mean = vload_net_t(k, means); - const float scale_stddiv = vload_net_t(k, stddivs); - - for (int i = 0; i < 4; i++) { - if (pred[i]) { - o[i] = scale_stddiv * (o[i] - mean); - if (residual) { - o[i] += vload_net_t(kHW + a[i], residual); - } - o[i] = o[i] > 0 ? 
o[i] : 0.0f; - ybuf[kg * T + a[i]] = o[i]; - if (Y) { - vstore_net_t(o[i], kHW + a[i], Y); - } - } - } - } +static const std::string sourceCode_tensorcore_test = + #include "kernels/tensorcore_test.opencl" +; - barrier(CLK_LOCAL_MEM_FENCE); - - if (block < P && k < K) { - const int CPpad = Ppad * Cpad; - // Cache input tile and handle zero padding - float xx[4][4]; - for (int i = 0; i < 4; i++) { - int b = yin + i; - for (int j = 0; j < 4; j++) { - int a = xin + j; - if (b >= 0 && a >= 0 && b < H && a < W) { - xx[i][j] = ybuf[kg * T + b*W + a]; - } else { - xx[i][j] = 0.0f; - } - } - } +static const std::string sourceCode_config = R"( +#define BOARD_SIZE )" + std::to_string(BOARD_SIZE) + +"\n#define NUM_INTERSECTIONS " + std::to_string(NUM_INTERSECTIONS) + +"\n#define WINOGRAD_M " + std::to_string(WINOGRAD_M) + +"\n#define WINOGRAD_ALPHA " + std::to_string(WINOGRAD_ALPHA) + +"\n#define WTILES " + std::to_string(WINOGRAD_WTILES); - const int offset = k*Ppad + block; - __in_transform_eq(xx, V, offset, CPpad); - } -} -)"; +static const std::string sourceCode_convolve1 = + #include "kernels/convolve1.opencl" +; -#ifdef USE_HALF -const std::string sourceCode_sgemm = - #include "clblast_level3_half/common.opencl" - #include "clblast_level3_half/xgemm_part1.opencl" - #include "clblast_level3_half/xgemm_part2.opencl" - #include "clblast_level3_half/xgemm_part3.opencl" - #include "clblast_level3_half/xgemm_batched.opencl" +static const std::string sourceCode_convolve3 = + #include "kernels/convolve3.opencl" ; -#else + const std::string sourceCode_sgemm = - #include "clblast_level3/common.opencl" - #include "clblast_level3/xgemm_part1.opencl" - #include "clblast_level3/xgemm_part2.opencl" - #include "clblast_level3/xgemm_part3.opencl" - #include "clblast_level3/xgemm_batched.opencl" + "#if TCE == 1\n" // Enable tensorcore + #include "kernels/clblast/hgemm_tensorcore.opencl" + "\n#else\n" // Use clblast + #include "kernels/clblast/xgemm_part1.opencl" + #include 
"kernels/clblast/xgemm_part2.opencl" + #include "kernels/clblast/xgemm_part3.opencl" + #include "kernels/clblast/xgemm_batched.opencl" + "\n#endif\n" ; -#endif -thread_local ThreadData opencl_thread_data; - -void OpenCL::ensure_thread_initialized() { - if (!opencl_thread_data.m_is_initialized) { +template +void OpenCL::ensure_context_initialized(OpenCLContext& opencl_context) { + if (!opencl_context.m_is_initialized) { // Make kernels - opencl_thread_data.m_convolve1_kernel = + opencl_context.m_convolve1_kernel = cl::Kernel(m_program, "convolve1"); - opencl_thread_data.m_merge_kernel = + opencl_context.m_merge_kernel = cl::Kernel(m_program, "merge"); - opencl_thread_data.m_in_transform_kernel = + opencl_context.m_in_transform_kernel = cl::Kernel(m_program, "in_transform"); - opencl_thread_data.m_sgemm_kernel = + opencl_context.m_sgemm_kernel = cl::Kernel(m_program, "XgemmBatched"); - opencl_thread_data.m_out_transform_bn_kernel = + opencl_context.m_out_transform_bn_kernel = cl::Kernel(m_program, "out_transform_fused_bn"); - opencl_thread_data.m_out_transform_bn_in_kernel = + opencl_context.m_out_transform_bn_in_kernel = cl::Kernel(m_program, "out_transform_fused_bn_in"); - opencl_thread_data.m_commandqueue = - cl::CommandQueue(m_context, m_device); - opencl_thread_data.m_is_initialized = true; + opencl_context.m_commandqueue = cl::CommandQueue(m_context, m_device); + opencl_context.m_is_initialized = true; } } -void OpenCL_Network::add_weights(size_t layer, - size_t size, - const float * weights) { +template +void OpenCL_Network::add_weights(const size_t layer, const size_t size, + const net_t* const weights) { if (layer >= m_layers.size()) { m_layers.push_back(Layer()); } - auto converted_weights = std::vector(); - for (auto i = size_t{0}; i < size; i++) { - converted_weights.emplace_back(weights[i]); - } + auto weightSize = size * sizeof(net_t); - auto weightSize = size * sizeof(decltype(converted_weights)::value_type); - m_layers.back().weights.emplace_back( - 
m_opencl.m_context, - CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, - weightSize, - const_cast(converted_weights.data())); + auto queue = cl::CommandQueue(getOpenCL().m_context, getOpenCL().m_device); + auto buffer = + cl::Buffer(m_opencl.m_context, CL_MEM_READ_ONLY, weightSize, nullptr); + queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, weightSize, + const_cast(weights)); + m_layers.back().weights.push_back(std::move(buffer)); } -void OpenCL_Network::forward(const std::vector& input, - std::vector& output_pol, - std::vector& output_val) { - constexpr auto width = BOARD_SIZE; - constexpr auto height = BOARD_SIZE; +template +void OpenCL_Network::forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val, + OpenCLContext& opencl_context, + const int batch_size) { constexpr auto tiles = WINOGRAD_P; - constexpr auto one_plane = width * height * sizeof(net_t); - const auto finalSize_pol = m_layers[m_layers.size()-2].outputs * one_plane; + constexpr auto one_plane = NUM_INTERSECTIONS * sizeof(net_t); + const auto finalSize_pol = + m_layers[m_layers.size() - 2].outputs * one_plane; const auto finalSize_val = m_layers.back().outputs * one_plane; - m_opencl.ensure_thread_initialized(); + m_opencl.ensure_context_initialized(opencl_context); - if (!opencl_thread_data.m_buffers_allocated) { + if (!opencl_context.m_buffers_allocated) { auto max_channels = unsigned{0}; for (const auto& layer : m_layers) { - max_channels = std::max(max_channels, - std::max(layer.channels, layer.outputs)); + max_channels = + std::max(max_channels, std::max(layer.channels, layer.outputs)); } const auto mwg = m_opencl.m_sgemm_tuners.mwg; @@ -482,45 +168,55 @@ void OpenCL_Network::forward(const std::vector& input, const auto m_ceil = ceilMultiple(ceilMultiple(max_channels, mwg), vwm); const auto n_ceil = ceilMultiple(ceilMultiple(tiles, nwg), vwn); - const auto alloc_inSize = - m_ceil * m_ceil * max_channels * sizeof(net_t); - const auto alloc_vm_size = - WINOGRAD_TILE * m_ceil * 
n_ceil * sizeof(net_t); + const auto alloc_inSize = getOpenCL().m_batch_size * NUM_INTERSECTIONS + * max_channels * sizeof(net_t); + const auto alloc_vm_size = getOpenCL().m_batch_size * WINOGRAD_TILE + * m_ceil * n_ceil * sizeof(net_t); auto v_zeros = std::vector(alloc_vm_size); - opencl_thread_data.m_inBuffer = cl::Buffer( + opencl_context.m_inBuffer = cl::Buffer( m_opencl.m_context, CL_MEM_READ_WRITE, alloc_inSize); - opencl_thread_data.m_inBuffer2 = cl::Buffer( + opencl_context.m_inBuffer2 = cl::Buffer( m_opencl.m_context, CL_MEM_READ_WRITE, alloc_inSize); - opencl_thread_data.m_VBuffer = cl::Buffer( + opencl_context.m_VBuffer = cl::Buffer( m_opencl.m_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, alloc_vm_size, v_zeros.data(), nullptr); - opencl_thread_data.m_MBuffer = cl::Buffer( + opencl_context.m_MBuffer = cl::Buffer( m_opencl.m_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_vm_size); - opencl_thread_data.m_pinnedOutBuffer_pol = cl::Buffer( + opencl_context.m_pinnedOutBuffer_pol = cl::Buffer( m_opencl.m_context, - CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, finalSize_pol); - opencl_thread_data.m_pinnedOutBuffer_val = cl::Buffer( + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + getOpenCL().m_batch_size * finalSize_pol); + opencl_context.m_pinnedOutBuffer_val = cl::Buffer( m_opencl.m_context, - CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, finalSize_val); + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + getOpenCL().m_batch_size * finalSize_val); - opencl_thread_data.m_buffers_allocated = true; + opencl_context.m_buffers_allocated = true; } - cl::Buffer & inBuffer = opencl_thread_data.m_inBuffer; - cl::Buffer & inBuffer2 = opencl_thread_data.m_inBuffer2; - cl::Buffer & VBuffer = opencl_thread_data.m_VBuffer; - cl::Buffer & MBuffer = opencl_thread_data.m_MBuffer; - cl::CommandQueue & queue = opencl_thread_data.m_commandqueue; + cl::Buffer& inBuffer = opencl_context.m_inBuffer; + cl::Buffer& inBuffer2 = opencl_context.m_inBuffer2; 
+ cl::Buffer& VBuffer = opencl_context.m_VBuffer; + cl::Buffer& MBuffer = opencl_context.m_MBuffer; + cl::CommandQueue& queue = opencl_context.m_commandqueue; + + std::vector net_t_input(input.size()); + std::copy(begin(input), end(input), begin(net_t_input)); const auto inSize = sizeof(net_t) * input.size(); - queue.enqueueWriteBuffer(inBuffer, CL_FALSE, 0, inSize, input.data()); + queue.enqueueWriteBuffer(inBuffer, CL_FALSE, 0, inSize, net_t_input.data()); + + // Fused in_out transformation kernel is slower with big batch_sizes than + // calling out and in transformations separately. + // This condition could be tunable in future. + auto use_inout = (batch_size == 1); auto skip_in_trans = false; for (auto iter = cbegin(m_layers); iter != cend(m_layers); iter++) { @@ -533,27 +229,32 @@ void OpenCL_Network::forward(const std::vector& input, auto bn_weights = begin(layer.weights) + 1; auto skip_next_in_trans = false; if (niter->is_residual_block) { - skip_next_in_trans = true; + skip_next_in_trans = use_inout; } - convolve3(layer.channels, - layer.outputs, - inBuffer, - inBuffer, - VBuffer, - MBuffer, - conv_weights, - nullptr, - bn_weights, - skip_in_trans, skip_next_in_trans, true); + + convolve3(opencl_context, + layer.channels, + layer.outputs, + inBuffer, + inBuffer, + VBuffer, + MBuffer, + conv_weights, + nullptr, + bn_weights, + skip_in_trans, skip_next_in_trans, true, + batch_size); + skip_in_trans = skip_next_in_trans; } else if (layer.is_residual_block) { assert(layer.channels == layer.outputs); assert(niter != cend(m_layers)); auto conv1_weights = begin(layer.weights); - auto bn1_weights = begin(layer.weights) + 1; + auto bn1_weights = begin(layer.weights) + 1; auto conv2_weights = begin(layer.weights) + 3; - auto bn2_weights = begin(layer.weights) + 4; - convolve3(layer.channels, + auto bn2_weights = begin(layer.weights) + 4; + convolve3(opencl_context, + layer.channels, layer.outputs, inBuffer, inBuffer2, @@ -562,13 +263,15 @@ void 
OpenCL_Network::forward(const std::vector& input, conv1_weights, nullptr, bn1_weights, - skip_in_trans, true, false); + skip_in_trans, use_inout, false, + batch_size); auto skip_next_in_trans = false; if (niter->is_residual_block) { - skip_next_in_trans = true; + skip_next_in_trans = use_inout; } - convolve3(layer.channels, + convolve3(opencl_context, + layer.channels, layer.outputs, inBuffer2, inBuffer, @@ -577,33 +280,35 @@ void OpenCL_Network::forward(const std::vector& input, conv2_weights, &inBuffer, bn2_weights, - true, skip_next_in_trans, true); + use_inout, skip_next_in_trans, true, + batch_size); skip_in_trans = skip_next_in_trans; } else { assert(layer.is_convolve1); cl::Buffer out_buffer; if (niter == cend(m_layers)) { - out_buffer = opencl_thread_data.m_pinnedOutBuffer_val; + out_buffer = opencl_context.m_pinnedOutBuffer_val; } else { - out_buffer = opencl_thread_data.m_pinnedOutBuffer_pol; + out_buffer = opencl_context.m_pinnedOutBuffer_pol; } - convolve1(layer.channels, - layer.outputs, - inBuffer, - out_buffer, - VBuffer, - begin(layer.weights)); + convolve1(opencl_context, layer.channels, + layer.outputs, + inBuffer, + out_buffer, + VBuffer, + begin(layer.weights), + batch_size); } } - auto pinnedOutBufferHost_pol = queue.enqueueMapBuffer( - opencl_thread_data.m_pinnedOutBuffer_pol, CL_FALSE, - CL_MAP_READ, 0, finalSize_pol); - auto pinnedOutBufferHost_val = queue.enqueueMapBuffer( - opencl_thread_data.m_pinnedOutBuffer_val, CL_FALSE, - CL_MAP_READ, 0, finalSize_val); + auto pinnedOutBufferHost_pol = + queue.enqueueMapBuffer(opencl_context.m_pinnedOutBuffer_pol, CL_FALSE, + CL_MAP_READ, 0, batch_size * finalSize_pol); + auto pinnedOutBufferHost_val = + queue.enqueueMapBuffer(opencl_context.m_pinnedOutBuffer_val, CL_FALSE, + CL_MAP_READ, 0, batch_size * finalSize_val); { // Finish call is usually a busy wait. 
When using multiple threads @@ -612,34 +317,38 @@ void OpenCL_Network::forward(const std::vector& input, queue.finish(); } - std::memcpy(output_pol.data(), pinnedOutBufferHost_pol, finalSize_pol); - std::memcpy(output_val.data(), pinnedOutBufferHost_val, finalSize_val); - - queue.enqueueUnmapMemObject(opencl_thread_data.m_pinnedOutBuffer_pol, - pinnedOutBufferHost_pol); - queue.enqueueUnmapMemObject(opencl_thread_data.m_pinnedOutBuffer_val, - pinnedOutBufferHost_val); + auto polptr = static_cast(pinnedOutBufferHost_pol); + auto valptr = static_cast(pinnedOutBufferHost_val); + std::copy(polptr, polptr + output_pol.size(), begin(output_pol)); + std::copy(valptr, valptr + output_val.size(), begin(output_val)); + queue.enqueueUnmapMemObject(opencl_context.m_pinnedOutBuffer_pol, + pinnedOutBufferHost_pol); + queue.enqueueUnmapMemObject(opencl_context.m_pinnedOutBuffer_val, + pinnedOutBufferHost_val); } -void OpenCL_Network::convolve3(int channels, int outputs, - cl::Buffer& bufferIn, - cl::Buffer& bufferOut, - cl::Buffer& bufferV, - cl::Buffer& bufferM, - weight_slice_t weights, - cl::Buffer* bufferResidual, - weight_slice_t bn_weights, - bool skip_in_transform, - bool fuse_in_transform, - bool store_inout) { - - cl::Kernel & in_transform_kernel = opencl_thread_data.m_in_transform_kernel; - cl::Kernel & sgemm_kernel = opencl_thread_data.m_sgemm_kernel; - cl::Kernel & out_transform_bn_kernel = - opencl_thread_data.m_out_transform_bn_kernel; - cl::Kernel & out_transform_bn_in_kernel = - opencl_thread_data.m_out_transform_bn_in_kernel; +template +void OpenCL_Network::convolve3(OpenCLContext& opencl_context, + const int channels, const int outputs, + cl::Buffer& bufferIn, + cl::Buffer& bufferOut, + cl::Buffer& bufferV, + cl::Buffer& bufferM, + const weight_slice_t weights, + cl::Buffer* const bufferResidual, + const weight_slice_t bn_weights, + const bool skip_in_transform, + const bool fuse_in_transform, + const bool store_inout, + const int batch_size) { + + cl::Kernel& 
in_transform_kernel = opencl_context.m_in_transform_kernel; + cl::Kernel& sgemm_kernel = opencl_context.m_sgemm_kernel; + cl::Kernel& out_transform_bn_kernel = + opencl_context.m_out_transform_bn_kernel; + cl::Kernel& out_transform_bn_in_kernel = + opencl_context.m_out_transform_bn_in_kernel; auto mwg = m_opencl.m_sgemm_tuners.mwg; auto nwg = m_opencl.m_sgemm_tuners.nwg; @@ -648,6 +357,10 @@ void OpenCL_Network::convolve3(int channels, int outputs, auto vwn = m_opencl.m_sgemm_tuners.vwn; auto mdimc = m_opencl.m_sgemm_tuners.mdimc; auto ndimc = m_opencl.m_sgemm_tuners.ndimc; + auto tce = m_opencl.m_sgemm_tuners.tce; + auto mdima = m_opencl.m_sgemm_tuners.mdima; + auto ndimb = m_opencl.m_sgemm_tuners.ndimb; + auto wavefront_size = m_opencl.m_wavefront_size; assert(mwg != 0); @@ -660,15 +373,15 @@ void OpenCL_Network::convolve3(int channels, int outputs, assert(wavefront_size != 0); constexpr auto tiles = WINOGRAD_P; - constexpr auto width = BOARD_SIZE; - constexpr auto height = BOARD_SIZE; - auto wgs = ceilMultiple(tiles, wavefront_size); + auto wgs = ceilMultiple(batch_size * tiles, wavefront_size); + auto wgs_single = ceilMultiple(tiles, wavefront_size); + auto m_ceil = int(ceilMultiple(ceilMultiple(outputs, mwg), vwm)); - auto n_ceil = int(ceilMultiple(ceilMultiple(tiles, nwg), vwn)); + auto n_ceil = int(ceilMultiple(ceilMultiple(batch_size * tiles, nwg), vwn)); auto k_ceil = int(ceilMultiple(ceilMultiple(channels, kwg), vwm)); - cl::CommandQueue & queue = opencl_thread_data.m_commandqueue; + cl::CommandQueue& queue = opencl_context.m_commandqueue; if (!skip_in_transform) { try { @@ -677,12 +390,13 @@ void OpenCL_Network::convolve3(int channels, int outputs, in_transform_kernel.setArg(2, channels); in_transform_kernel.setArg(3, k_ceil); in_transform_kernel.setArg(4, n_ceil); + in_transform_kernel.setArg(5, batch_size); queue.enqueueNDRangeKernel(in_transform_kernel, cl::NullRange, cl::NDRange(wgs, channels)); - } catch (const cl::Error &e) { - std::cerr << "Error 
in convolve3: " << e.what() << ": " - << e.err() << std::endl; + } catch (const cl::Error& e) { + std::cerr << "Error in convolve3/in: " << e.what() << ": " + << e.err() << std::endl; throw; } } @@ -701,17 +415,25 @@ void OpenCL_Network::convolve3(int channels, int outputs, (n_ceil * ndimc) / nwg, cl::size_type(WINOGRAD_TILE)}; + // tensorcore implementation uses a different dimension + if (tce) { + local_sgemm = {32 * mdimc / mdima, ndimc / ndimb, 1}; + size_sgemm = {32 * m_ceil / mdima * mdimc / mwg, + n_ceil / ndimb * ndimc / nwg, + cl::size_type(WINOGRAD_TILE)}; + } queue.enqueueNDRangeKernel(sgemm_kernel, cl::NullRange, size_sgemm, local_sgemm); - } catch (const cl::Error &e) { - std::cerr << "Error in convolve3: " << e.what() << ": " - << e.err() << std::endl; + } catch (const cl::Error& e) { + std::cerr << "Error in convolve3/sgemm: " << e.what() << ": " << e.err() + << std::endl; throw; } try { if (fuse_in_transform) { // TODO : Eventually this might also be something tuneable? + // Needs to match OUTIN_KWG in kernel constexpr auto dim_size = 2; out_transform_bn_in_kernel.setArg(0, bufferM); if (store_inout) { @@ -733,45 +455,55 @@ void OpenCL_Network::convolve3(int channels, int outputs, } out_transform_bn_in_kernel.setArg(8, bn_weights[0]); out_transform_bn_in_kernel.setArg(9, bn_weights[1]); - out_transform_bn_in_kernel.setArg(10, - cl::Local(dim_size * width * height * sizeof(float))); - queue.enqueueNDRangeKernel(out_transform_bn_in_kernel, - cl::NullRange, - cl::NDRange(outputs, wgs), - cl::NDRange(dim_size, wgs)); + queue.enqueueNDRangeKernel( + out_transform_bn_in_kernel, cl::NullRange, + cl::NDRange(outputs, wgs_single, batch_size), + cl::NDRange(dim_size, wgs_single, 1)); } else { out_transform_bn_kernel.setArg(0, bufferM); out_transform_bn_kernel.setArg(1, bufferOut); out_transform_bn_kernel.setArg(2, outputs); out_transform_bn_kernel.setArg(3, m_ceil); out_transform_bn_kernel.setArg(4, n_ceil); + out_transform_bn_kernel.setArg(5, batch_size); if 
(bufferResidual) { - out_transform_bn_kernel.setArg(5, *bufferResidual); + out_transform_bn_kernel.setArg(6, *bufferResidual); } else { - out_transform_bn_kernel.setArg(5, nullptr); + out_transform_bn_kernel.setArg(6, nullptr); } - out_transform_bn_kernel.setArg(6, bn_weights[0]); - out_transform_bn_kernel.setArg(7, bn_weights[1]); + out_transform_bn_kernel.setArg(7, bn_weights[0]); + out_transform_bn_kernel.setArg(8, bn_weights[1]); + + // Needs to match OUT_KWG, OUT_BWG in the kernel. + // This could be tuned. + cl::NDRange local_out = {32, 2}; + + cl::NDRange global_out = { + ceilMultiple(outputs, local_out[0]), + ceilMultiple(tiles * batch_size, local_out[1])}; queue.enqueueNDRangeKernel(out_transform_bn_kernel, cl::NullRange, - cl::NDRange(outputs, wgs)); + global_out, local_out); } - } catch (const cl::Error &e) { - std::cerr << "Error in convolve3: " << e.what() << ": " - << e.err() << std::endl; + } catch (const cl::Error& e) { + std::cerr << "Error in convolve3/out: " << e.what() << ": " << e.err() + << std::endl; throw; } } -void OpenCL_Network::convolve1(int channels, int outputs, - cl::Buffer& bufferInput, - cl::Buffer& bufferOutput, - cl::Buffer& bufferMerge, - weight_slice_t weights) { +template +void OpenCL_Network::convolve1(OpenCLContext& opencl_context, + const int channels, const int outputs, + cl::Buffer& bufferInput, + cl::Buffer& bufferOutput, + cl::Buffer& bufferMerge, + const weight_slice_t weights, + const int batch_size) { // The size of the board is defined at compile time constexpr int width = BOARD_SIZE; - constexpr int boardsize = BOARD_SQUARES; + constexpr int boardsize = NUM_INTERSECTIONS; constexpr int rowTiles = BOARD_SIZE; // Input channel grouping in multiples of 8 @@ -780,7 +512,7 @@ void OpenCL_Network::convolve1(int channels, int outputs, constexpr int rowGroup = 1; size_t outputGroup = std::min(outputs, 32); - auto m_convolve_kernel = &opencl_thread_data.m_convolve1_kernel; + auto m_convolve_kernel = 
&opencl_context.m_convolve1_kernel; #ifndef NDEBUG // Total output size after reducing @@ -797,25 +529,27 @@ void OpenCL_Network::convolve1(int channels, int outputs, int rowBuffer = std::min(channelGroup, 7); size_t rowSize = channelGroup * outputGroup * rowBuffer * sizeof(float); - cl::CommandQueue & queue = opencl_thread_data.m_commandqueue; + cl::CommandQueue& queue = opencl_context.m_commandqueue; try { m_convolve_kernel->setArg(0, bufferInput); m_convolve_kernel->setArg(1, bufferMerge); m_convolve_kernel->setArg(2, weights[0]); - m_convolve_kernel->setArg(3, cl::Local(stripSize * channelGroup * rowGroup)); + m_convolve_kernel->setArg( + 3, cl::Local(stripSize * channelGroup * rowGroup)); m_convolve_kernel->setArg(4, cl::Local(rowSize)); - queue.enqueueNDRangeKernel(*m_convolve_kernel, cl::NullRange, - cl::NDRange(channels, outputs, rowTiles), - cl::NDRange(channelGroup, outputGroup, rowGroup)); - } catch (const cl::Error &e) { - std::cerr << "Error in convolve1: " << e.what() << ": " - << e.err() << std::endl; + queue.enqueueNDRangeKernel( + *m_convolve_kernel, cl::NullRange, + cl::NDRange(channels, outputs, batch_size * rowTiles), + cl::NDRange(channelGroup, outputGroup, rowGroup)); + } catch (const cl::Error& e) { + std::cerr << "Error in convolve1: " << e.what() << ": " << e.err() + << std::endl; throw; } - cl::Kernel & merge_kernel = opencl_thread_data.m_merge_kernel; + cl::Kernel& merge_kernel = opencl_context.m_merge_kernel; assert(channels % (1 << channelShift) == 0); try { @@ -823,18 +557,19 @@ void OpenCL_Network::convolve1(int channels, int outputs, merge_kernel.setArg(1, bufferOutput); merge_kernel.setArg(2, channels >> channelShift); - queue.enqueueNDRangeKernel(merge_kernel, cl::NullRange, - cl::NDRange(outputs, boardsize), - cl::NDRange(std::min(8, outputs), BOARD_SIZE)); - } catch (const cl::Error &e) { - std::cerr << "Error in merge: " << e.what() << ": " - << e.err() << std::endl; + queue.enqueueNDRangeKernel( + merge_kernel, cl::NullRange, + 
cl::NDRange(outputs, boardsize, batch_size), + cl::NDRange(std::min(8, outputs), BOARD_SIZE, 1)); + } catch (const cl::Error& e) { + std::cerr << "Error in merge: " << e.what() << ": " << e.err() + << std::endl; throw; } } -template -static std::string opencl_dev_type_to_string(T type) { +template +static std::string opencl_dev_type_to_string(const T type) { if (type == CL_DEVICE_TYPE_CPU) { return "CPU"; } else if (type == CL_DEVICE_TYPE_GPU) { @@ -851,7 +586,8 @@ static std::string trim(std::string trim_me) { return trim_me; } -void OpenCL::process_tuners(std::string tuners) { +template +void OpenCL::process_tuners(std::string tuners) { std::string buf; std::stringstream ss(tuners); std::size_t found; @@ -861,8 +597,12 @@ void OpenCL::process_tuners(std::string tuners) { auto kwg = false; auto ndimc = false; auto mdimc = false; + auto mdima = false; + auto ndimb = false; auto vwm = false; auto vwn = false; + auto tce = false; + while (ss >> buf) { found = buf.find("="); if (found == std::string::npos) { @@ -883,6 +623,14 @@ void OpenCL::process_tuners(std::string tuners) { m_sgemm_tuners.kwg = value; kwg = true; } + if (name == "-DMDIMA") { + m_sgemm_tuners.mdima = value; + mdima = true; + } + if (name == "-DNDIMB") { + m_sgemm_tuners.ndimb = value; + ndimb = true; + } if (name == "-DMDIMC") { m_sgemm_tuners.mdimc = value; mdimc = true; @@ -899,8 +647,13 @@ void OpenCL::process_tuners(std::string tuners) { m_sgemm_tuners.vwn = value; vwn = true; } + if (name == "-DTCE") { + m_sgemm_tuners.tce = value; + tce = true; + } } - if (!mwg || !nwg || !kwg || !mdimc || !ndimc || !vwm || !vwn) { + if (!mwg || !nwg || !kwg || !mdimc || !ndimc + || !vwm || !vwn || !mdima || !ndimb) { std::cerr << "Missing tuner parameters"; if (!mwg) { std::cerr << " MWG"; @@ -911,6 +664,12 @@ void OpenCL::process_tuners(std::string tuners) { if (!kwg) { std::cerr << " KWG"; } + if (!mdima) { + std::cerr << " MDIMA"; + } + if (!ndimb) { + std::cerr << " NDIMB"; + } if (!mdimc) { std::cerr << 
" MDIMC"; } @@ -923,12 +682,16 @@ void OpenCL::process_tuners(std::string tuners) { if (!vwn) { std::cerr << " VWN"; } + if (!tce) { + std::cerr << " TCE"; + } std::cerr << std::endl; std::exit(-1); } } -std::vector OpenCL::get_sgemm_tuners(void) { +template +std::vector OpenCL::get_sgemm_tuners() { std::vector tuners; tuners.emplace_back(m_sgemm_tuners.mwg); @@ -942,12 +705,12 @@ std::vector OpenCL::get_sgemm_tuners(void) { return tuners; } -void OpenCL::initialize(const int channels, const std::vector & gpus, - bool silent) { +template +OpenCL::OpenCL(const int gpu, const bool silent) { std::vector platforms; try { cl::Platform::get(&platforms); - } catch (const cl::Error &e) { + } catch (const cl::Error& e) { myprintf("OpenCL: %s\n", e.what()); throw; } @@ -964,13 +727,13 @@ void OpenCL::initialize(const int channels, const std::vector & gpus, myprintf("Detected %d OpenCL platforms.\n", platforms.size()); } - for (const auto &p : platforms) { + for (const auto& p : platforms) { std::string platvers = p.getInfo(); if (!silent) { std::string platprof = p.getInfo(); std::string platname = p.getInfo(); std::string platvend = p.getInfo(); - myprintf("Platform version: %s\n", platvers.c_str());; + myprintf("Platform version: %s\n", platvers.c_str()); myprintf("Platform profile: %s\n", platprof.c_str()); myprintf("Platform name: %s\n", platname.c_str()); myprintf("Platform vendor: %s\n", platvend.c_str()); @@ -984,7 +747,7 @@ void OpenCL::initialize(const int channels, const std::vector & gpus, std::vector devices; try { p.getDevices(CL_DEVICE_TYPE_ALL, &devices); - } catch (const cl::Error &e) { + } catch (const cl::Error& e) { myprintf("Error getting device(s): %s: %d\n", e.what(), e.err()); devices.clear(); } @@ -994,40 +757,45 @@ void OpenCL::initialize(const int channels, const std::vector & gpus, myprintf("Device name: %s\n", trim(d.getInfo()).c_str()); myprintf("Device type: %s\n", - opencl_dev_type_to_string(d.getInfo()).c_str()); + 
opencl_dev_type_to_string(d.getInfo()) + .c_str()); myprintf("Device vendor: %s\n", - d.getInfo().c_str()); + d.getInfo().c_str()); myprintf("Device driver: %s\n", - d.getInfo().c_str()); + d.getInfo().c_str()); myprintf("Device speed: %u MHz\n", - d.getInfo()); + d.getInfo()); myprintf("Device cores: %u CU\n", - d.getInfo()); + d.getInfo()); } // assign score, try to find best device int this_score = 0; std::string this_vendor = d.getInfo(); - this_score += 1000 * boost::icontains(this_vendor, "advanced micro devices"); + this_score += + 1000 * boost::icontains(this_vendor, "advanced micro devices"); this_score += 1000 * boost::icontains(this_vendor, "amd"); this_score += 1000 * boost::icontains(this_vendor, "nvidia"); - this_score += 500 * boost::icontains(this_vendor, "intel"); - this_score += 100 * (d.getInfo() == CL_DEVICE_TYPE_GPU); - this_score += opencl_version * 10; + this_score += 500 * boost::icontains(this_vendor, "intel"); + this_score += + 100 * (d.getInfo() == CL_DEVICE_TYPE_GPU); + this_score += opencl_version * 10; if (!silent) { myprintf("Device score: %d\n", this_score); } - bool preferred = - std::find(cbegin(gpus), cend(gpus), id) != cend(gpus); + bool preferred = (gpu == id); - if ((this_score > best_score) || preferred) { + if (((this_score > best_score) + && (d.getInfo() != CL_DEVICE_TYPE_CPU)) + || preferred) { best_version = opencl_version; best_platform = p; best_device = d; best_vendor = this_vendor; if (preferred) { - best_score = std::numeric_limits::max(); + best_score = + std::numeric_limits::max(); } else { best_score = this_score; } @@ -1042,67 +810,114 @@ void OpenCL::initialize(const int channels, const std::vector & gpus, } myprintf("Selected platform: %s\n", - best_platform.getInfo().c_str()); + best_platform.getInfo().c_str()); myprintf("Selected device: %s\n", - trim(best_device.getInfo()).c_str()); + trim(best_device.getInfo()).c_str()); myprintf("with OpenCL %2.1f capability.\n", best_version); cl::Context context; try { 
context = cl::Context(best_device); - } catch (const cl::Error &e) { + } catch (const cl::Error& e) { myprintf("Error creating OpenCL context: %s: %d", e.what(), e.err()); throw std::runtime_error("Error creating OpenCL context."); } m_context = context; m_device = best_device; + m_cl_args = getClArgs(); + + myprintf("Half precision compute support: "); + if (m_device.getInfo().find("cl_khr_fp16") + != std::string::npos) { + myprintf("Yes.\n"); + m_fp16_compute = true; + m_cl_args += " -DFP16_SUPPORT"; + } else { + myprintf("No.\n"); + } + + myprintf("Tensor Core support: "); + { + // if this is a nvidia GPU, test-compile a sample inline assembly code + // with tensor wmma instructions. if not, don't bother trying + std::string this_vendor = m_device.getInfo(); + if (boost::icontains(this_vendor, "nvidia")) { + try { + cl::Program(m_context, sourceCode_tensorcore_test) + .build(m_cl_args.c_str()); + m_tensorcore = true; + myprintf("Yes.\n"); + } catch (...) { + myprintf("No.\n"); + } + } else { + myprintf("No.\n"); + } + } +} + +template +void OpenCL::initialize(const int channels, const size_t batch_size) { + m_batch_size = batch_size; // Make program of the source code in the context try { - m_program = cl::Program(m_context, - sourceCode_config - + sourceCode_convolve1 - + sourceCode_convolve3 - + sourceCode_sgemm); - } catch (const cl::Error &e) { + m_program = cl::Program(m_context, sourceCode_common + sourceCode_config + + sourceCode_convolve1 + + sourceCode_convolve3 + + sourceCode_sgemm); + } catch (const cl::Error& e) { myprintf("Error getting kernels: %s: %d", e.what(), e.err()); throw std::runtime_error("Error getting OpenCL kernels."); } - m_cl_args = cl_args; + auto t = Tuner(*this, m_context, m_device); + if (m_tensorcore) { + t.enable_tensorcore(); + } - auto t = Tuner(*this, m_context, m_device); - auto sgemm_tuners = - t.load_sgemm_tuners(channels, WINOGRAD_P, channels, WINOGRAD_TILE); + auto sgemm_tuners = t.load_sgemm_tuners(channels, batch_size 
* WINOGRAD_P, + channels, WINOGRAD_TILE); - // Exit immediately after tuning. Some NVIDIA drivers are buggy - // and will fail to compile the rest of the kernels after a tuning - // run. See #729. + // Some NVIDIA drivers are buggy and will fail to compile the rest of the + // kernels after a tuning run. if (cfg_tune_only) { - exit(EXIT_SUCCESS); + // Originally this was an exit() but this will make the tuner + // only tune the first GPU. Return instead. Exit will be called + // after all GPUs are created. + return; } // Build program for these specific devices try { - std::string args = cl_args; + std::string args = m_cl_args; + // Intel iGPUs need vector types for math for best performance + if (m_device.getInfo() > 1) { + args += " -DWINOGRAD_SIMD"; + } + args += sgemm_tuners; m_program.build(args.c_str()); } catch (const cl::Error&) { - myprintf("Error building kernels: %s\n", - m_program.getBuildInfo(m_device).c_str()); + myprintf( + "Error building kernels: %s\n", + m_program.getBuildInfo(m_device).c_str()); throw std::runtime_error("Error building OpenCL kernels."); } - ensure_thread_initialized(); + OpenCLContext tdata; + ensure_context_initialized(tdata); + process_tuners(sgemm_tuners); m_wavefront_size = - opencl_thread_data.m_sgemm_kernel.getWorkGroupInfo( - best_device); + tdata.m_sgemm_kernel + .getWorkGroupInfo( + m_device); myprintf("Wavefront/Warp size: %d\n", m_wavefront_size); - m_max_workgroup_size = best_device.getInfo(); - m_max_workgroup_dims = best_device.getInfo(); + m_max_workgroup_size = m_device.getInfo(); + m_max_workgroup_dims = m_device.getInfo(); myprintf("Max workgroup size: %d\n", m_max_workgroup_size); myprintf("Max workgroup dimensions: "); @@ -1114,7 +929,18 @@ void OpenCL::initialize(const int channels, const std::vector & gpus, m_init_ok = true; } -std::string OpenCL::get_device_name() { +template +bool OpenCL::has_fp16_compute() { + return m_fp16_compute; +} + +template +bool OpenCL::has_tensor_cores() { + return m_tensorcore; 
+} + +template +std::string OpenCL::get_device_name() { std::stringstream ss; ss << "OpenCL: "; @@ -1124,4 +950,12 @@ std::string OpenCL::get_device_name() { return ss.str(); } + +template class OpenCL; +template class OpenCL_Network; +#ifdef USE_HALF +template class OpenCL; +template class OpenCL_Network; +#endif + #endif diff --git a/src/OpenCL.h b/src/OpenCL.h index bc7e0c3f1..4dd061730 100644 --- a/src/OpenCL.h +++ b/src/OpenCL.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #ifndef OPENCL_H_INCLUDED @@ -21,25 +32,25 @@ #include "config.h" -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 -#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 110 +#define CL_HPP_TARGET_OPENCL_VERSION 120 #define CL_HPP_ENABLE_EXCEPTIONS #include +#include #include #include +#include #include #include -#include #include "Tuner.h" -static constexpr auto WINOGRAD_P = (BOARD_SIZE + 1) * (BOARD_SIZE + 1) / 4; -static constexpr auto WINOGRAD_TILE = 4 * 4; - -class OpenCL; +template class OpenCL; +template class OpenCL_Network; class Layer { - friend class OpenCL_Network; + template friend class OpenCL_Network; + private: unsigned int channels{0}; unsigned int outputs{0}; @@ -50,9 +61,10 @@ class Layer { std::vector weights; }; -class ThreadData { - friend class OpenCL; - friend class OpenCL_Network; +class OpenCLContext { + template friend class OpenCL; + template friend class OpenCL_Network; + private: bool m_is_initialized{false}; cl::CommandQueue m_commandqueue; @@ -71,20 +83,21 @@ class ThreadData { bool m_buffers_allocated{false}; }; +template class OpenCL_Network { public: - OpenCL_Network(OpenCL & opencl) : m_opencl(opencl) {} - OpenCL & getOpenCL() { + OpenCL_Network(OpenCL& opencl) : m_opencl(opencl) {} + OpenCL& getOpenCL() { return m_opencl; } - void push_input_convolution(unsigned int filter_size, - unsigned int channels, - unsigned int outputs, - const std::vector& weights, - const std::vector& means, - const std::vector& variances) { - size_t layer = get_layer_count(); + void push_input_convolution(const unsigned int filter_size, + const unsigned int channels, + const unsigned int outputs, + const std::vector& weights, + const std::vector& means, + const std::vector& variances) { + const auto layer = get_layer_count(); push_weights(layer, weights); push_weights(layer, means); push_weights(layer, variances); @@ -94,16 +107,15 @@ class OpenCL_Network { m_layers[layer].channels = channels; } - void push_residual(unsigned int 
filter_size, - unsigned int channels, - unsigned int outputs, - const std::vector& weights_1, - const std::vector& means_1, - const std::vector& variances_1, - const std::vector& weights_2, - const std::vector& means_2, - const std::vector& variances_2) { - size_t layer = get_layer_count(); + void push_residual(const unsigned int filter_size, + const unsigned int channels, const unsigned int outputs, + const std::vector& weights_1, + const std::vector& means_1, + const std::vector& variances_1, + const std::vector& weights_2, + const std::vector& means_2, + const std::vector& variances_2) { + const auto layer = get_layer_count(); push_weights(layer, weights_1); push_weights(layer, means_1); push_weights(layer, variances_1); @@ -116,10 +128,13 @@ class OpenCL_Network { m_layers[layer].channels = channels; } - void push_convolve1(unsigned int channels, - unsigned int outputs, - const std::vector& weights) { - size_t layer = get_layer_count(); + void push_convolve(const unsigned int filter_size, + const unsigned int channels, const unsigned int outputs, + const std::vector& weights) { + (void)filter_size; + assert(filter_size == 1); + + const auto layer = get_layer_count(); push_weights(layer, weights); m_layers[layer].is_convolve1 = true; m_layers[layer].outputs = outputs; @@ -130,35 +145,35 @@ class OpenCL_Network { return m_layers.size(); } - void forward(const std::vector& input, - std::vector& output_pol, - std::vector& output_val); + void forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val, + OpenCLContext& opencl_context, int batch_size = 1); private: using weight_slice_t = std::vector::const_iterator; - void push_weights(size_t layer, const std::vector& weights) { + void push_weights(const size_t layer, const std::vector& weights) { add_weights(layer, weights.size(), weights.data()); } - void add_weights(size_t layer, size_t size, const float* weights); - - void convolve3(int channels, int outputs, - cl::Buffer& bufferIn, - 
cl::Buffer& bufferOut, - cl::Buffer& bufferV, - cl::Buffer& bufferM, weight_slice_t weights, - cl::Buffer* bufferResidual, - weight_slice_t bn_weights, - bool skip_in_transform, - bool fuse_in_transform, bool store_inout); - - void convolve1(int channels, int outputs, - cl::Buffer& bufferInput, - cl::Buffer& bufferOutput, - cl::Buffer& bufferMerge, - weight_slice_t weights); - - OpenCL & m_opencl; + void add_weights(size_t layer, size_t size, const net_t* weights); + + void convolve3(OpenCLContext& opencl_context, int channels, int outputs, + cl::Buffer& bufferIn, + cl::Buffer& bufferOut, + cl::Buffer& bufferV, + cl::Buffer& bufferM, weight_slice_t weights, + cl::Buffer* bufferResidual, + weight_slice_t bn_weights, bool skip_in_transform, + bool fuse_in_transform, bool store_inout, int batch_size); + + void convolve1(OpenCLContext& opencl_context, int channels, int outputs, + cl::Buffer& bufferInput, + cl::Buffer& bufferOutput, + cl::Buffer& bufferMerge, + weight_slice_t weights, int batch_size); + + OpenCL& m_opencl; // this mutex is not required for correctness, but this exists simply // because queue.finish() is a busy wait and having a lot of threads @@ -168,39 +183,49 @@ class OpenCL_Network { std::vector m_layers; }; +template class OpenCL { - friend class OpenCL_Network; - friend class Tuner; + friend class OpenCL_Network; + friend class Tuner; + public: - void initialize(const int channels, const std::vector & gpus, - bool silent = false); - void ensure_thread_initialized(void); + OpenCL(int gpu, bool silent = false); + + void initialize(int channels, size_t batch_size = 1); + void ensure_context_initialized(OpenCLContext& opencl_context); std::string get_device_name(); + bool has_fp16_compute(); + bool has_tensor_cores(); - std::vector get_sgemm_tuners(void); + std::vector get_sgemm_tuners(); cl::Device m_device; cl::Context m_context; + private: - void tune_sgemm(void); void process_tuners(std::string tuners); + size_t m_batch_size = 1; cl::Program 
m_program; std::string m_cl_args; struct sgemm_tuners { size_t mwg, nwg, kwg; size_t vwm, vwn; + size_t mdima, ndimb; size_t mdimc, ndimc; + size_t tce; }; sgemm_tuners m_sgemm_tuners; size_t m_wavefront_size{0}; size_t m_max_workgroup_size{0}; std::vector m_max_workgroup_dims; + bool m_fp16_compute{false}; + bool m_tensorcore{false}; bool m_init_ok{false}; }; -extern thread_local ThreadData opencl_thread_data; extern const std::string sourceCode_sgemm; +extern const std::string sourceCode_common; #endif diff --git a/src/OpenCLScheduler.cpp b/src/OpenCLScheduler.cpp index 096a50f7b..762d2a04d 100644 --- a/src/OpenCLScheduler.cpp +++ b/src/OpenCLScheduler.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2018 Junhee Yoo and contributors + Copyright (C) 2018-2019 Junhee Yoo and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,69 +14,435 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" #ifdef USE_OPENCL + #include "GTP.h" -#include "Random.h" +#include "Network.h" #include "OpenCLScheduler.h" +#include "Random.h" +#include "Utils.h" + +using Utils::ceilMultiple; +using Utils::myprintf; + +class from_float { +public: + from_float(const std::vector& f) : m_f(f) {} -thread_local auto current_thread_gpu_num = size_t{0}; -OpenCLScheduler opencl; + operator const std::vector &() { + return m_f; + } + + operator std::vector() { + auto ret = std::vector(m_f.size()); + std::copy(cbegin(m_f), cend(m_f), begin(ret)); + return ret; + } + +private: + const std::vector& m_f; +}; -void OpenCLScheduler::initialize(const int channels) { +template +static std::vector zeropad_U(const std::vector& U, const int outputs, + const int channels, const int outputs_pad, + const int channels_pad) { + // Fill with zeroes + auto Upad = std::vector(WINOGRAD_TILE * outputs_pad * channels_pad); + + for (auto xi = 0; xi < WINOGRAD_ALPHA; xi++) { + for (auto nu = 0; nu < WINOGRAD_ALPHA; nu++) { + for (auto c = 0; c < channels; c++) { + for (auto o = 0; o < outputs; o++) { + Upad[xi * (WINOGRAD_ALPHA * outputs_pad * channels_pad) + + nu * (outputs_pad * channels_pad) + c * outputs_pad + + o] = + U[xi * (WINOGRAD_ALPHA * outputs * channels) + + nu * (outputs * channels) + c * outputs + o]; + } + } + } + } + + return Upad; +} + +template +OpenCLScheduler::OpenCLScheduler() { // multi-gpu? - if (!cfg_gpus.empty()) { - auto silent{false}; - for (auto gpu : cfg_gpus) { - auto opencl = std::make_unique(); - auto net = std::make_unique(*opencl); - opencl->initialize(channels, {gpu}, silent); - m_opencl.push_back(std::move(opencl)); - m_networks.push_back(std::move(net)); - - // Clear thread data on every init call. 
We don't know which GPU - // this thread will be eventually be assigned to - opencl_thread_data = ThreadData(); - - // starting next GPU, let's not dump full list of GPUs - silent = true; + auto gpus = cfg_gpus; + + // An empty GPU list from the command line represents autodetect. + // Put a minus one GPU index here. + if (gpus.empty()) { + gpus = {-1}; + } + + auto silent{false}; + + for (auto gpu : gpus) { + auto opencl = std::make_unique>(gpu, silent); + auto net = std::make_unique>(*opencl); + m_opencl.push_back(std::move(opencl)); + m_networks.push_back(std::move(net)); + + // Starting next GPU, let's not dump full list of GPUs. + silent = true; + } +} + +template +void OpenCLScheduler::initialize(const int channels) { + // Launch the worker threads. Minimum 1 worker per GPU, but use enough + // threads so that we can at least concurrently schedule something to the + // GPU. + auto num_worker_threads = + cfg_num_threads / cfg_batch_size / (m_opencl.size() + 1) + 1; + auto gnum = 0; + for (auto& opencl : m_opencl) { + opencl->initialize(channels, cfg_batch_size); + + for (auto i = unsigned{0}; i < num_worker_threads; i++) { + auto t = + std::thread(&OpenCLScheduler::batch_worker, this, gnum); + m_worker_threads.push_back(std::move(t)); } + gnum++; + } + + // Exit immediately after tuning. We should exit here because we skipped + // initializing rest of the kernels due to some NVIDIA drivers crashing. + if (cfg_tune_only) { + exit(EXIT_SUCCESS); + } +} + +template +OpenCLScheduler::~OpenCLScheduler() { + { + std::unique_lock lk(m_mutex); + m_running = false; + } + m_cv.notify_all(); + for (auto& x : m_worker_threads) { + x.join(); + } +} + +template +bool OpenCLScheduler::needs_autodetect() { + for (auto& opencl : m_opencl) { + // If any card has no native fp16 compute, we'll have to benchmark. 
+ if (!opencl->has_fp16_compute() && !opencl->has_tensor_cores()) { + return true; + } + } + return false; +} + +template +void OpenCLScheduler::push_input_convolution( + const unsigned int filter_size, const unsigned int channels, + const unsigned int outputs, + const std::vector& weights, + const std::vector& means, + const std::vector& variances) { + + for (const auto& opencl_net : m_networks) { + const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners(); + + const auto mwg = tuners[0]; + const auto kwg = tuners[2]; + const auto vwm = tuners[3]; + + const auto m_ceil = ceilMultiple(ceilMultiple(outputs, mwg), vwm); + const auto k_ceil = ceilMultiple(ceilMultiple(channels, kwg), vwm); + + const auto Upad = + zeropad_U(weights, outputs, channels, m_ceil, k_ceil); + opencl_net->push_input_convolution(filter_size, channels, outputs, Upad, + from_float(means), + from_float(variances)); + } +} + +template +void OpenCLScheduler::push_residual( + const unsigned int filter_size, const unsigned int channels, + const unsigned int outputs, + const std::vector& weights_1, + const std::vector& means_1, + const std::vector& variances_1, + const std::vector& weights_2, + const std::vector& means_2, + const std::vector& variances_2) { + for (const auto& opencl_net : m_networks) { + const auto tuners = opencl_net->getOpenCL().get_sgemm_tuners(); + + const auto mwg = tuners[0]; + const auto vwm = tuners[3]; + + const auto m_ceil = ceilMultiple(ceilMultiple(outputs, mwg), vwm); + const auto Upad1 = + zeropad_U(weights_1, outputs, outputs, m_ceil, m_ceil); + const auto Upad2 = + zeropad_U(weights_2, outputs, outputs, m_ceil, m_ceil); + opencl_net->push_residual(filter_size, channels, outputs, + Upad1, from_float(means_1), + from_float(variances_1), + Upad2, from_float(means_2), + from_float(variances_2)); + } +} - for (size_t gnum = 0; gnum < m_networks.size(); gnum++) { - // launch the worker thread. 
2 threads so that we can fully - // utilize GPU, since the worker thread consists of some CPU - // work for task preparation. - constexpr auto num_threads = 2; - for (auto i = 0; i < num_threads; i++) { - m_threadpool.add_thread([gnum] { - current_thread_gpu_num = gnum; +template +void OpenCLScheduler::push_convolve(const unsigned int filter_size, + const unsigned int channels, + const unsigned int outputs, + const std::vector& weights) { + for (const auto& opencl_net : m_networks) { + opencl_net->push_convolve(filter_size, channels, outputs, + from_float(weights)); + } +} + +template +void OpenCLScheduler::push_weights( + const unsigned int filter_size, const unsigned int channels, + const unsigned int outputs, + std::shared_ptr weights) { + + auto weight_index = size_t{0}; + + // Winograd filter transformation changes filter size to 4x4 + push_input_convolution(filter_size, channels, outputs, + weights->m_conv_weights[weight_index], + weights->m_batchnorm_means[weight_index], + weights->m_batchnorm_stddevs[weight_index]); + weight_index++; + + // residual blocks : except the first entry, + // the second ~ last entry is all on residual tower + for (auto i = size_t{0}; i < weights->m_conv_weights.size() / 2; i++) { + push_residual(filter_size, outputs, outputs, + weights->m_conv_weights[weight_index], + weights->m_batchnorm_means[weight_index], + weights->m_batchnorm_stddevs[weight_index], + weights->m_conv_weights[weight_index + 1], + weights->m_batchnorm_means[weight_index + 1], + weights->m_batchnorm_stddevs[weight_index + 1]); + weight_index += 2; + } + + // Output head convolutions + push_convolve(1, outputs, Network::OUTPUTS_POLICY, weights->m_conv_pol_w); + push_convolve(1, outputs, Network::OUTPUTS_VALUE, weights->m_conv_val_w); +} + +template +void OpenCLScheduler::forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val) { + auto entry = + std::make_shared(input, output_pol, output_val); + std::unique_lock lk(entry->mutex); 
+ { + std::unique_lock lk(m_mutex); + m_forward_queue.push_back(entry); + + if (m_single_eval_in_progress.load()) { + m_waittime += 2; + } + } + m_cv.notify_one(); + entry->cv.wait(lk); + + if (m_draining) { + throw NetworkHaltException(); + } +} + +#ifndef NDEBUG +struct batch_stats_t batch_stats; +#endif + +template +void OpenCLScheduler::batch_worker(const size_t gnum) { + constexpr auto in_size = Network::INPUT_CHANNELS * BOARD_SIZE * BOARD_SIZE; + constexpr auto out_pol_size = + Network::OUTPUTS_POLICY * BOARD_SIZE * BOARD_SIZE; + constexpr auto out_val_size = + Network::OUTPUTS_VALUE * BOARD_SIZE * BOARD_SIZE; + + OpenCLContext context; + + // batch scheduling heuristic. + // Returns the batch picked up from the queue (m_forward_queue) + // 1) Wait for m_waittime milliseconds for full batch + // 2) if we don't have a full batch then just do a single eval + // + // The purpose of m_waittime is to prevent the system from deadlocking + // because we were waiting for a job too long, while the job is never + // going to come due to a control dependency (e.g., evals stuck on a + // critical path). To do so: + // + // 1) if we couldn't form a batch after waiting m_waittime ms, it means + // that we hit the critical path and should do scalar evals. + // Wait 1ms shorter next time. + // + // 2) if we picked up a single eval, but were getting additional evals + // while that single eval was being processed, it means that we made + // the wrong decision. Wait 2ms longer next time. 
+ + auto pickup_task = [this]() { + std::list> inputs; + size_t count = 0; + + std::unique_lock lk(m_mutex); + while (true) { + if (!m_running) { + return inputs; + } + count = m_forward_queue.size(); + if (count >= cfg_batch_size) { + count = cfg_batch_size; + break; + } + + bool timeout = !m_cv.wait_for( + lk, std::chrono::milliseconds(m_waittime), [this]() { + return !m_running + || m_forward_queue.size() >= cfg_batch_size; }); + + if (!m_forward_queue.empty()) { + if (timeout + && m_single_eval_in_progress.exchange(true) == false) { + // Waited long enough but couldn't form a batch. + // Check if there is any other single eval in progress, + // and if not, do one from this thread. + if (m_waittime > 1) { + m_waittime--; + } + count = 1; + break; + } } } - } else { - auto opencl = std::make_unique(); - auto net = std::make_unique(*opencl); - opencl->initialize(channels, {}); + // Move 'count' evals from shared queue to local list. + auto end = begin(m_forward_queue); + std::advance(end, count); + std::move(begin(m_forward_queue), end, std::back_inserter(inputs)); + m_forward_queue.erase(begin(m_forward_queue), end); - m_opencl.push_back(std::move(opencl)); - m_networks.push_back(std::move(net)); + return inputs; + }; + + auto batch_input = std::vector(); + auto batch_output_pol = std::vector(); + auto batch_output_val = std::vector(); + + while (true) { + auto inputs = pickup_task(); + auto count = inputs.size(); + + if (!m_running) { + return; + } + +#ifndef NDEBUG + if (count == 1) { + batch_stats.single_evals++; + } else { + batch_stats.batch_evals++; + } +#endif + + // prepare input for forward() call + batch_input.resize(in_size * count); + batch_output_pol.resize(out_pol_size * count); + batch_output_val.resize(out_val_size * count); + + auto index = size_t{0}; + for (auto& x : inputs) { + std::unique_lock lk(x->mutex); + std::copy(begin(x->in), end(x->in), + begin(batch_input) + in_size * index); + index++; + } + + // run the NN evaluation + 
m_networks[gnum]->forward(batch_input, batch_output_pol, + batch_output_val, context, count); + + // Get output and copy back + index = 0; + for (auto& x : inputs) { + std::copy(begin(batch_output_pol) + out_pol_size * index, + begin(batch_output_pol) + out_pol_size * (index + 1), + begin(x->out_p)); + std::copy(begin(batch_output_val) + out_val_size * index, + begin(batch_output_val) + out_val_size * (index + 1), + begin(x->out_v)); + x->cv.notify_all(); + index++; + } + + if (count == 1) { + m_single_eval_in_progress = false; + } } } -void OpenCLScheduler::forward(const std::vector& input, - std::vector& output_pol, - std::vector& output_val) { - if (m_networks.size() == 1) { - m_networks[0]->forward(input, output_pol, output_val); - return; +template +void OpenCLScheduler::drain() { + // When signaled to drain requests, this method picks up all pending + // requests and wakes them up. Throws exception once the woken up request + // sees m_draining. + m_draining = true; + + std::list> fq; + { + std::unique_lock lk(m_mutex); + std::move(m_forward_queue.begin(), m_forward_queue.end(), + std::back_inserter(fq)); + m_forward_queue.clear(); } - auto f = m_threadpool.add_task([this, &input, &output_pol, &output_val]{ - m_networks[current_thread_gpu_num]->forward(input, output_pol, output_val); - }); + for (auto& x : fq) { + { + // dummy lock/unlock to make sure thread in forward() is sleeping + std::unique_lock lk(x->mutex); + } + x->cv.notify_all(); + } +} + +template +void OpenCLScheduler::resume() { + // UCTNode::think() should wait for all child threads to complete before resuming. + assert(m_forward_queue.empty()); - f.get(); + m_draining = false; } + +template class OpenCLScheduler; +#ifdef USE_HALF +template class OpenCLScheduler; +#endif + #endif diff --git a/src/OpenCLScheduler.h b/src/OpenCLScheduler.h index ca232a068..3a447fe7f 100644 --- a/src/OpenCLScheduler.h +++ b/src/OpenCLScheduler.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2018 Junhee Yoo and contributors + Copyright (C) 2018-2019 Junhee Yoo and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,44 +14,107 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ -#ifndef OPENCL_SCHEDULER_H_INCLUDED -#define OPENCL_SCHEDULER_H_INCLUDED +#ifndef OPENCLSCHEDULER_H_INCLUDED +#define OPENCLSCHEDULER_H_INCLUDED #include "config.h" +#include +#include #include -#include +#include "ForwardPipe.h" #include "OpenCL.h" +#include "SMP.h" #include "ThreadPool.h" -class OpenCLScheduler { -public: - void initialize(const int channels); - std::vector> & get_networks() { - return m_networks; - } - void forward(const std::vector& input, - std::vector& output_pol, - std::vector& output_val); -private: - class ForwardTask { +#ifndef NDEBUG +struct batch_stats_t { + std::atomic single_evals{0}; + std::atomic batch_evals{0}; +}; +extern batch_stats_t batch_stats; +#endif + +template +class OpenCLScheduler : public ForwardPipe { + class ForwardQueueEntry { public: - const std::vector *input; - std::vector * output; - std::promise prom; - ForwardTask() : input(nullptr), output(nullptr) {} - ForwardTask(const std::vector * in, - std::vector * out) - : input(in), output(out) {} + std::mutex mutex; + std::condition_variable cv; + const std::vector& in; + std::vector& out_p; 
+ std::vector& out_v; + ForwardQueueEntry(const std::vector& input, + std::vector& output_pol, + std::vector& output_val) + : in(input), out_p(output_pol), out_v(output_val) {} }; - std::vector> m_networks; - std::vector> m_opencl; - Utils::ThreadPool m_threadpool; -}; +public: + virtual ~OpenCLScheduler(); + OpenCLScheduler(); + + virtual void initialize(int channels); + virtual void forward(const std::vector& input, + std::vector& output_pol, + std::vector& output_val); + virtual bool needs_autodetect(); + virtual void push_weights( + unsigned int filter_size, unsigned int channels, unsigned int outputs, + std::shared_ptr weights); -extern OpenCLScheduler opencl; +private: + bool m_running = true; + std::atomic m_draining{false}; + std::vector>> m_networks; + std::vector>> m_opencl; + + std::mutex m_mutex; + std::condition_variable m_cv; + + // start with 10 milliseconds : lock protected + int m_waittime{10}; + + // set to true when single (non-batch) eval is in progress + std::atomic m_single_eval_in_progress{false}; + + std::list> m_forward_queue; + std::list m_worker_threads; + + void batch_worker(size_t gnum); + void push_input_convolution(unsigned int filter_size, unsigned int channels, + unsigned int outputs, + const std::vector& weights, + const std::vector& means, + const std::vector& variances); + + void push_residual(unsigned int filter_size, unsigned int channels, + unsigned int outputs, + const std::vector& weights_1, + const std::vector& means_1, + const std::vector& variances_1, + const std::vector& weights_2, + const std::vector& means_2, + const std::vector& variances_2); + + void push_convolve(unsigned int filter_size, unsigned int channels, + unsigned int outputs, const std::vector& weights); + + virtual void drain(); + virtual void resume(); +}; #endif diff --git a/src/Random.cpp b/src/Random.cpp index 39c583546..1b176bb76 100644 --- a/src/Random.cpp +++ b/src/Random.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,25 +14,37 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" -#include "Random.h" #include #include -#include #include +#include + +#include "Random.h" #include "GTP.h" #include "Utils.h" -Random& Random::get_Rng(void) { +Random& Random::get_Rng() { static thread_local Random s_rng{0}; return s_rng; } -Random::Random(std::uint64_t seed) { +Random::Random(const std::uint64_t seed) { if (seed == 0) { size_t thread_id = std::hash()(std::this_thread::get_id()); @@ -45,7 +57,7 @@ Random::Random(std::uint64_t seed) { // This is xoroshiro128+. // Note that the last bit isn't entirely random, so don't use it, // if possible. -std::uint64_t Random::gen(void) { +std::uint64_t Random::gen() { const std::uint64_t s0 = m_s[0]; std::uint64_t s1 = m_s[1]; const std::uint64_t result = s0 + s1; @@ -73,11 +85,10 @@ static std::uint64_t splitmix64(std::uint64_t z) { return z ^ (z >> 31); } -void Random::seedrandom(std::uint64_t seed) { +void Random::seedrandom(const std::uint64_t seed) { // Initialize state of xoroshiro128+ by transforming the seed // with the splitmix64 algorithm. 
// As suggested by http://xoroshiro.di.unimi.it/xoroshiro128plus.c m_s[0] = splitmix64(seed); m_s[1] = splitmix64(m_s[0]); } - diff --git a/src/Random.h b/src/Random.h index 4d4ce0c7a..10f7f0ac1 100644 --- a/src/Random.h +++ b/src/Random.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,12 +14,24 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef RANDOM_H_INCLUDED #define RANDOM_H_INCLUDED #include "config.h" + #include #include @@ -33,11 +45,11 @@ class Random { void seedrandom(std::uint64_t s); // Random numbers from [0, max - 1] - template + template std::uint32_t randfix() { - static_assert(0 < MAX && - MAX < std::numeric_limits::max(), - "randfix out of range"); + static_assert(0 < MAX + && MAX < std::numeric_limits::max(), + "randfix out of range"); // Last bit isn't random, so don't use it in isolation. We specialize // this case. 
static_assert(MAX != 2, "don't isolate the LSB with xoroshiro128+"); @@ -47,10 +59,10 @@ class Random { std::uint64_t randuint64(); // Random number from [0, max - 1] - std::uint64_t randuint64(const std::uint64_t max); + std::uint64_t randuint64(std::uint64_t max); // return the thread local RNG - static Random& get_Rng(void); + static Random& get_Rng(); // UniformRandomBitGenerator interface using result_type = std::uint64_t; @@ -65,12 +77,12 @@ class Random { } private: - std::uint64_t gen(void); + std::uint64_t gen(); std::uint64_t m_s[2]; }; // Specialization for last bit: use sign test -template<> +template <> inline std::uint32_t Random::randfix<2>() { return (gen() > (std::numeric_limits::max() / 2)); } diff --git a/src/SGFParser.cpp b/src/SGFParser.cpp index 508d1fa6b..39ba5c7ac 100644 --- a/src/SGFParser.cpp +++ b/src/SGFParser.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,9 +14,18 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . -*/ -#include "SGFParser.h" + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ #include #include @@ -24,24 +33,28 @@ #include #include +#include "SGFParser.h" + #include "SGFTree.h" #include "Utils.h" std::vector SGFParser::chop_stream(std::istream& ins, - size_t stopat) { + const size_t stopat) { std::vector result; std::string gamebuff; ins >> std::noskipws; - int nesting = 0; // parentheses - bool intag = false; // brackets + int nesting = 0; // parentheses + bool intag = false; // brackets int line = 0; gamebuff.clear(); char c; while (ins >> c && result.size() <= stopat) { - if (c == '\n') line++; + if (c == '\n') { + line++; + } gamebuff.push_back(c); if (c == '\\') { @@ -85,9 +98,10 @@ std::vector SGFParser::chop_stream(std::istream& ins, return result; } -std::vector SGFParser::chop_all(std::string filename, - size_t stopat) { - std::ifstream ins(filename.c_str(), std::ifstream::binary | std::ifstream::in); +std::vector SGFParser::chop_all(const std::string& filename, + const size_t stopat) { + std::ifstream ins(filename.c_str(), + std::ifstream::binary | std::ifstream::in); if (ins.fail()) { throw std::runtime_error("Error opening file"); @@ -100,12 +114,13 @@ std::vector SGFParser::chop_all(std::string filename, } // scan the file and extract the game with number index -std::string SGFParser::chop_from_file(std::string filename, size_t index) { +std::string SGFParser::chop_from_file(const std::string& filename, + const size_t index) { auto vec = chop_all(filename, index); return vec[index]; } -std::string SGFParser::parse_property_name(std::istringstream & strm) { +std::string SGFParser::parse_property_name(std::istringstream& strm) { std::string result; char c; @@ -124,8 +139,8 @@ std::string SGFParser::parse_property_name(std::istringstream & strm) { return result; } -bool SGFParser::parse_property_value(std::istringstream & strm, - std::string & result) { +bool SGFParser::parse_property_value(std::istringstream& strm, + std::string& result) { strm >> std::noskipws; char c; @@ -157,7 +172,7 @@ bool 
SGFParser::parse_property_value(std::istringstream & strm, return true; } -void SGFParser::parse(std::istringstream & strm, SGFTree * node) { +void SGFParser::parse(std::istringstream& strm, SGFTree* node) { bool splitpoint = false; char c; @@ -200,7 +215,7 @@ void SGFParser::parse(std::istringstream & strm, SGFTree * node) { // start a variation here splitpoint = true; // new node - SGFTree * newptr = node->add_child(); + SGFTree* newptr = node->add_child(); parse(strm, newptr); } else if (c == ')') { // variation ends, go back @@ -216,7 +231,7 @@ void SGFParser::parse(std::istringstream & strm, SGFTree * node) { } } else if (c == ';') { // new node - SGFTree * newptr = node->add_child(); + SGFTree* newptr = node->add_child(); node = newptr; continue; } diff --git a/src/SGFParser.h b/src/SGFParser.h index 04132322f..117c0c87b 100644 --- a/src/SGFParser.h +++ b/src/SGFParser.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,14 +14,25 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #ifndef SGFPARSER_H_INCLUDED #define SGFPARSER_H_INCLUDED +#include #include #include -#include #include #include #include @@ -30,16 +41,18 @@ class SGFParser { private: - static std::string parse_property_name(std::istringstream & strm); - static bool parse_property_value(std::istringstream & strm, std::string & result); + static std::string parse_property_name(std::istringstream& strm); + static bool parse_property_value(std::istringstream& strm, + std::string& result); + public: - static std::string chop_from_file(std::string fname, size_t index); - static std::vector chop_all(std::string fname, + static std::string chop_from_file(const std::string& filename, + size_t index); + static std::vector chop_all(const std::string& filename, size_t stopat = SIZE_MAX); static std::vector chop_stream(std::istream& ins, size_t stopat = SIZE_MAX); - static void parse(std::istringstream & strm, SGFTree * node); + static void parse(std::istringstream& strm, SGFTree* node); }; - #endif diff --git a/src/SGFTree.cpp b/src/SGFTree.cpp index 6402e8205..a3b1d21c2 100644 --- a/src/SGFTree.cpp +++ b/src/SGFTree.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,14 +14,24 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" -#include "SGFTree.h" -#include -#include #include +#include +#include #include #include #include @@ -29,6 +39,8 @@ #include #include +#include "SGFTree.h" + #include "FullBoard.h" #include "GTP.h" #include "KoState.h" @@ -37,20 +49,22 @@ using namespace Utils; -void SGFTree::init_state(void) { +const int SGFTree::EOT; + +void SGFTree::init_state() { m_initialized = true; // Initialize with defaults. // The SGF might be missing boardsize or komi // which means we'll never initialize properly. - m_state.init_game(19, 7.5f); + m_state.init_game(std::min(BOARD_SIZE, 19), KOMI); } -KoState * SGFTree::get_state(void) { +const KoState* SGFTree::get_state() const { assert(m_initialized); return &m_state; } -SGFTree * SGFTree::get_child(size_t count) { +const SGFTree* SGFTree::get_child(const size_t count) const { if (count < m_children.size()) { assert(m_initialized); return &(m_children[count]); @@ -62,23 +76,29 @@ SGFTree * SGFTree::get_child(size_t count) { // This follows the entire line, and doesn't really need the intermediate // states, just the moves. As a consequence, states that contain more than // just moves won't have any effect. -GameState SGFTree::follow_mainline_state(unsigned int movenum) { - SGFTree * link = this; +GameState SGFTree::follow_mainline_state(const unsigned int movenum) const { + const auto* link = this; // This initializes a starting state from a KoState and // sets up the game history. 
GameState result(get_state()); + if (m_timecontrol_ptr) { + result.set_timecontrol(*m_timecontrol_ptr); + } + for (unsigned int i = 0; i <= movenum && link != nullptr; i++) { // root position has no associated move if (i != 0) { - int move = link->get_move(result.get_to_move()); - if (move != SGFTree::EOT) { - if (move != FastBoard::PASS && move != FastBoard::EMPTY - && result.board.get_square(move) != FastBoard::EMPTY) { + auto colored_move = link->get_colored_move(); + if (colored_move.first != FastBoard::INVAL) { + if (colored_move.second != FastBoard::PASS + && colored_move.second != FastBoard::EMPTY + && result.board.get_state(colored_move.second) + != FastBoard::EMPTY) { // Fail loading return result; } - result.play_move(move); + result.play_move(colored_move.first, colored_move.second); } } link = link->get_child(0); @@ -87,7 +107,7 @@ GameState SGFTree::follow_mainline_state(unsigned int movenum) { return result; } -void SGFTree::load_from_string(std::string gamebuff) { +void SGFTree::load_from_string(const std::string& gamebuff) { std::istringstream pstream(gamebuff); // loads properties with moves @@ -102,22 +122,22 @@ void SGFTree::load_from_string(std::string gamebuff) { } // load a single game from a file -void SGFTree::load_from_file(std::string filename, int index) { - std::string gamebuff = SGFParser::chop_from_file(filename, index); +void SGFTree::load_from_file(const std::string& filename, const int index) { + auto gamebuff = SGFParser::chop_from_file(filename, index); - //myprintf("Parsing: %s\n", gamebuff.c_str()); + // myprintf("Parsing: %s\n", gamebuff.c_str()); load_from_string(gamebuff); } -void SGFTree::populate_states(void) { +void SGFTree::populate_states() { PropertyMap::iterator it; - bool valid_size = false; - bool has_handicap = false; + auto valid_size = false; + auto has_handicap = false; // first check for go game setup in properties it = m_properties.find("GM"); - if (it != m_properties.end()) { + if (it != end(m_properties)) { 
if (it->second != "1") { throw std::runtime_error("SGF Game is not a Go game"); } else { @@ -131,14 +151,14 @@ void SGFTree::populate_states(void) { // board size it = m_properties.find("SZ"); - if (it != m_properties.end()) { - std::string size = it->second; + if (it != end(m_properties)) { + const auto size = it->second; std::istringstream strm(size); int bsize; strm >> bsize; if (bsize == BOARD_SIZE) { - // Assume 7.5 komi if not specified - m_state.init_game(bsize, 7.5f); + // Assume default komi in config.h if not specified + m_state.init_game(bsize, KOMI); valid_size = true; } else { throw std::runtime_error("Board size not supported."); @@ -147,14 +167,14 @@ void SGFTree::populate_states(void) { // komi it = m_properties.find("KM"); - if (it != m_properties.end()) { - std::string foo = it->second; + if (it != end(m_properties)) { + const auto foo = it->second; std::istringstream strm(foo); float komi; strm >> komi; - int handicap = m_state.get_handicap(); + const auto handicap = m_state.get_handicap(); // last ditch effort: if no GM or SZ, assume 19x19 Go here - int bsize = 19; + auto bsize = 19; if (valid_size) { bsize = m_state.board.get_boardsize(); } @@ -166,10 +186,33 @@ void SGFTree::populate_states(void) { } } + // time + it = m_properties.find("TM"); + if (it != end(m_properties)) { + const auto maintime = it->second; + it = m_properties.find("OT"); + const auto byoyomi = (it != end(m_properties)) ? it->second : ""; + it = m_properties.find("BL"); + const auto black_time_left = + (it != end(m_properties)) ? it->second : ""; + it = m_properties.find("WL"); + const auto white_time_left = + (it != end(m_properties)) ? it->second : ""; + it = m_properties.find("OB"); + const auto black_moves_left = + (it != end(m_properties)) ? it->second : ""; + it = m_properties.find("OW"); + const auto white_moves_left = + (it != end(m_properties)) ? 
it->second : ""; + m_timecontrol_ptr = TimeControl::make_from_text_sgf( + maintime, byoyomi, black_time_left, white_time_left, + black_moves_left, white_moves_left); + } + // handicap it = m_properties.find("HA"); - if (it != m_properties.end()) { - std::string size = it->second; + if (it != end(m_properties)) { + const auto size = it->second; std::istringstream strm(size); float handicap; strm >> handicap; @@ -179,8 +222,8 @@ void SGFTree::populate_states(void) { // result it = m_properties.find("RE"); - if (it != m_properties.end()) { - std::string result = it->second; + if (it != end(m_properties)) { + const auto result = it->second; if (boost::algorithm::find_first(result, "Time")) { // std::cerr << "Skipping: " << result << std::endl; m_winner = FastBoard::EMPTY; @@ -191,7 +234,8 @@ void SGFTree::populate_states(void) { m_winner = FastBoard::BLACK; } else { m_winner = FastBoard::INVAL; - // std::cerr << "Could not parse game result: " << result << std::endl; + // std::cerr << "Could not parse game result: " << result << + // std::endl; } } } else { @@ -211,22 +255,22 @@ void SGFTree::populate_states(void) { } // Loop through the stone list and apply for (auto pit = prop_pair_ab.first; pit != prop_pair_ab.second; ++pit) { - auto move = pit->second; - int vtx = string_to_vertex(move); + const auto move = pit->second; + const auto vtx = string_to_vertex(move); apply_move(FastBoard::BLACK, vtx); } // XXX: count handicap stones const auto& prop_pair_aw = m_properties.equal_range("AW"); for (auto pit = prop_pair_aw.first; pit != prop_pair_aw.second; ++pit) { - auto move = pit->second; - int vtx = string_to_vertex(move); + const auto move = pit->second; + const auto vtx = string_to_vertex(move); apply_move(FastBoard::WHITE, vtx); } it = m_properties.find("PL"); - if (it != m_properties.end()) { - std::string who = it->second; + if (it != end(m_properties)) { + const auto who = it->second; if (who == "W") { m_state.set_to_move(FastBoard::WHITE); } else if (who == "B") 
{ @@ -241,9 +285,9 @@ void SGFTree::populate_states(void) { // XXX: maybe move this to the recursive call // get move for side to move - int move = child_state.get_move(m_state.get_to_move()); - if (move != EOT) { - child_state.apply_move(move); + const auto colored_move = child_state.get_colored_move(); + if (colored_move.first != FastBoard::INVAL) { + child_state.apply_move(colored_move.first, colored_move.second); } child_state.populate_states(); @@ -253,26 +297,27 @@ void SGFTree::populate_states(void) { void SGFTree::copy_state(const SGFTree& tree) { m_initialized = tree.m_initialized; m_state = tree.m_state; + m_timecontrol_ptr = tree.m_timecontrol_ptr; } -void SGFTree::apply_move(int color, int move) { +void SGFTree::apply_move(const int color, const int move) { if (move != FastBoard::PASS && move != FastBoard::RESIGN) { - int curr_sq = m_state.board.get_square(move); - if (curr_sq == !color || curr_sq == FastBoard::INVAL) { + auto vtx_state = m_state.board.get_state(move); + if (vtx_state == !color || vtx_state == FastBoard::INVAL) { throw std::runtime_error("Illegal move"); } - // Playing on an occupied square is legal in SGF setup, + // Playing on an occupied intersection is legal in SGF setup, // but we can't really handle it. So just ignore and hope that works. 
- if (curr_sq == color) { + if (vtx_state == color) { return; } - assert(curr_sq == FastBoard::EMPTY); + assert(vtx_state == FastBoard::EMPTY); } m_state.play_move(color, move); } -void SGFTree::apply_move(int move) { - int color = m_state.get_to_move(); +void SGFTree::apply_move(const int move) { + auto color = m_state.get_to_move(); apply_move(color, move); } @@ -280,7 +325,7 @@ void SGFTree::add_property(std::string property, std::string value) { m_properties.emplace(property, value); } -SGFTree * SGFTree::add_child() { +SGFTree* SGFTree::add_child() { // first allocation is better small if (m_children.size() == 0) { m_children.reserve(1); @@ -323,8 +368,7 @@ int SGFTree::string_to_vertex(const std::string& movestring) const { } // catch illegal SGF - if (cc1 < 0 || cc1 >= bsize - || cc2 < 0 || cc2 >= bsize) { + if (cc1 < 0 || cc1 >= bsize || cc2 < 0 || cc2 >= bsize) { throw std::runtime_error("Illegal SGF move"); } @@ -333,7 +377,7 @@ int SGFTree::string_to_vertex(const std::string& movestring) const { return vtx; } -int SGFTree::get_move(int tomove) { +int SGFTree::get_move(const int tomove) const { std::string colorstring; if (tomove == FastBoard::BLACK) { @@ -342,10 +386,8 @@ int SGFTree::get_move(int tomove) { colorstring = "W"; } - PropertyMap::iterator it; - it = m_properties.find(colorstring); - - if (it != m_properties.end()) { + auto it = m_properties.find(colorstring); + if (it != end(m_properties)) { std::string movestring = it->second; return string_to_vertex(movestring); } @@ -353,19 +395,32 @@ int SGFTree::get_move(int tomove) { return SGFTree::EOT; } -FastBoard::square_t SGFTree::get_winner() const { +std::pair SGFTree::get_colored_move() const { + for (const auto& prop : m_properties) { + if (prop.first == "B") { + return std::make_pair(FastBoard::BLACK, + string_to_vertex(prop.second)); + } else if (prop.first == "W") { + return std::make_pair(FastBoard::WHITE, + string_to_vertex(prop.second)); + } + } + return std::make_pair(FastBoard::INVAL, 
SGFTree::EOT); +} + +FastBoard::vertex_t SGFTree::get_winner() const { return m_winner; } -std::vector SGFTree::get_mainline() { +std::vector SGFTree::get_mainline() const { std::vector moves; - SGFTree * link = this; - int tomove = link->m_state.get_to_move(); + const auto* link = this; + auto tomove = link->m_state.get_to_move(); link = link->get_child(0); while (link != nullptr && link->is_initialized()) { - int move = link->get_move(tomove); + auto move = link->get_move(tomove); if (move != SGFTree::EOT) { moves.push_back(move); } @@ -376,7 +431,7 @@ std::vector SGFTree::get_mainline() { return moves; } -std::string SGFTree::state_to_string(GameState& pstate, int compcolor) { +std::string SGFTree::state_to_string(GameState& pstate, const int compcolor) { auto state = std::make_unique(); // make a working copy @@ -385,8 +440,8 @@ std::string SGFTree::state_to_string(GameState& pstate, int compcolor) { std::string header; std::string moves; - float komi = state->get_komi(); - int size = state->board.get_boardsize(); + auto komi = state->get_komi(); + auto size = state->board.get_boardsize(); time_t now; time(&now); char timestr[sizeof "2017-10-16"]; @@ -421,17 +476,18 @@ std::string SGFTree::state_to_string(GameState& pstate, int compcolor) { state->rewind(); // check handicap here (anchor point) - int handicap = 0; + auto handicap = 0; std::string handicapstr; for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { int vertex = state->board.get_vertex(i, j); - int square = state->board.get_square(vertex); + int vtx_state = state->board.get_state(vertex); - if (square == FastBoard::BLACK) { + if (vtx_state == FastBoard::BLACK) { handicap++; - handicapstr.append("[" + state->board.move_to_text_sgf(vertex) + "]"); + handicapstr.append("[" + state->board.move_to_text_sgf(vertex) + + "]"); } } } @@ -464,8 +520,10 @@ std::string SGFTree::state_to_string(GameState& pstate, int compcolor) { if (score > 0.0f) { header.append("RE[B+" + str(boost::format("%.1f") 
% score) + "]"); - } else { + } else if (score < 0.0f) { header.append("RE[W+" + str(boost::format("%.1f") % -score) + "]"); + } else { + header.append("RE[0]"); } } else { if (state->who_resigned() == FastBoard::WHITE) { @@ -475,7 +533,8 @@ std::string SGFTree::state_to_string(GameState& pstate, int compcolor) { } } - header.append("\nC[" + std::string{PROGRAM_NAME} + " options:" + cfg_options_str + "]"); + header.append("\nC[" + std::string{PROGRAM_NAME} + + " options:" + cfg_options_str + "]"); std::string result(header); result.append("\n"); diff --git a/src/SGFTree.h b/src/SGFTree.h index 30270cef5..5bd593eb8 100644 --- a/src/SGFTree.h +++ b/src/SGFTree.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #ifndef SGFTREE_H_INCLUDED @@ -28,33 +39,36 @@ #include "FastBoard.h" #include "GameState.h" #include "KoState.h" +#include "TimeControl.h" class SGFTree { public: - static const int EOT = 0; // End-Of-Tree marker + static constexpr auto EOT = 0; // End-Of-Tree marker SGFTree() = default; void init_state(); - KoState * get_state(); - GameState follow_mainline_state(unsigned int movenum = 999); - std::vector get_mainline(); - void load_from_file(std::string filename, int index = 0); - void load_from_string(std::string gamebuff); + const KoState* get_state() const; + GameState follow_mainline_state(unsigned int movenum = 999) const; + std::vector get_mainline() const; + + void load_from_file(const std::string& filename, int index = 0); + void load_from_string(const std::string& gamebuff); void add_property(std::string property, std::string value); - SGFTree * add_child(); - SGFTree * get_child(size_t count); - int get_move(int tomove); + SGFTree* add_child(); + const SGFTree* get_child(size_t count) const; + int get_move(int tomove) const; + std::pair get_colored_move() const; bool is_initialized() const { return m_initialized; } - FastBoard::square_t get_winner() const; + FastBoard::vertex_t get_winner() const; static std::string state_to_string(GameState& state, int compcolor); private: - void populate_states(void); + void populate_states(); void apply_move(int color, int move); void apply_move(int move); void copy_state(const SGFTree& state); @@ -64,7 +78,8 @@ class SGFTree { bool m_initialized{false}; KoState m_state; - FastBoard::square_t m_winner{FastBoard::INVAL}; + std::shared_ptr m_timecontrol_ptr; + FastBoard::vertex_t m_winner{FastBoard::INVAL}; std::vector m_children; PropertyMap m_properties; }; diff --git a/src/SMP.cpp b/src/SMP.cpp index d46756c18..c004e1a29 100644 --- a/src/SMP.cpp +++ b/src/SMP.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,25 +14,41 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . -*/ -#include "SMP.h" + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. +*/ #include #include +#include "SMP.h" + SMP::Mutex::Mutex() { m_lock = false; } -SMP::Lock::Lock(Mutex & m) { +SMP::Lock::Lock(Mutex& m) { m_mutex = &m; lock(); } void SMP::Lock::lock() { assert(!m_owns_lock); - while (m_mutex->m_lock.exchange(true, std::memory_order_acquire) == true); + // Test and Test-and-Set reduces memory contention + // However, just trying to Test-and-Set first improves performance in almost + // all cases + while (m_mutex->m_lock.exchange(true, std::memory_order_acquire)) { + while (m_mutex->m_lock.load(std::memory_order_relaxed)) {} + } m_owns_lock = true; } @@ -57,6 +73,6 @@ SMP::Lock::~Lock() { } } -int SMP::get_num_cpus() { +size_t SMP::get_num_cpus() { return std::thread::hardware_concurrency(); } diff --git a/src/SMP.h b/src/SMP.h index fd16e53fe..c063b6823 100644 --- a/src/SMP.h +++ b/src/SMP.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef SMP_H_INCLUDED @@ -22,27 +33,30 @@ #include "config.h" #include +#include namespace SMP { - int get_num_cpus(); + size_t get_num_cpus(); class Mutex { public: Mutex(); ~Mutex() = default; friend class Lock; + private: std::atomic m_lock; }; class Lock { public: - explicit Lock(Mutex & m); + explicit Lock(Mutex& m); ~Lock(); void lock(); void unlock(); + private: - Mutex * m_mutex; + Mutex* m_mutex; bool m_owns_lock{false}; }; } diff --git a/src/ThreadPool.h b/src/ThreadPool.h index 250abd6e6..be1ed7b47 100644 --- a/src/ThreadPool.h +++ b/src/ThreadPool.h @@ -4,7 +4,7 @@ Extended from code: Copyright (c) 2012 Jakob Progsch, Václav Zeman Modifications: - Copyright (c) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (c) 2017-2019 Gian-Carlo Pascutto and contributors This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages @@ -26,15 +26,15 @@ distribution. 
*/ -#include -#include -#include -#include -#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include namespace Utils { @@ -46,12 +46,14 @@ class ThreadPool { // create worker threads. This version has no initializers. void initialize(std::size_t); - // add an extra thread. The thread calls initializer() before doing anything, - // so that the user can initialize per-thread data structures before doing work. + // add an extra thread. The thread calls initializer() before doing + // anything, so that the user can initialize per-thread data structures + // before doing work. void add_thread(std::function initializer); - template + template auto add_task(F&& f, Args&&... args) -> std::future::type>; + private: std::vector m_threads; std::queue> m_tasks; @@ -68,7 +70,8 @@ inline void ThreadPool::add_thread(std::function initializer) { std::function task; { std::unique_lock lock(m_mutex); - m_condvar.wait(lock, [this]{ return m_exit || !m_tasks.empty(); }); + m_condvar.wait(lock, + [this] { return m_exit || !m_tasks.empty(); }); if (m_exit && m_tasks.empty()) { return; } @@ -80,25 +83,24 @@ inline void ThreadPool::add_thread(std::function initializer) { }); } -inline void ThreadPool::initialize(size_t threads) { +inline void ThreadPool::initialize(const size_t threads) { for (size_t i = 0; i < threads; i++) { - add_thread([](){} /* null function */); + add_thread([]() {} /* null function */); } } -template +template auto ThreadPool::add_task(F&& f, Args&&... args) -> std::future::type> { using return_type = typename std::result_of::type; - auto task = std::make_shared< std::packaged_task >( - std::bind(std::forward(f), std::forward(args)...) 
- ); + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); std::future res = task->get_future(); { std::unique_lock lock(m_mutex); - m_tasks.emplace([task](){(*task)();}); + m_tasks.emplace([task]() { (*task)(); }); } m_condvar.notify_one(); return res; @@ -110,27 +112,27 @@ inline ThreadPool::~ThreadPool() { m_exit = true; } m_condvar.notify_all(); - for (std::thread & worker : m_threads) { + for (std::thread& worker : m_threads) { worker.join(); } } class ThreadGroup { public: - ThreadGroup(ThreadPool & pool) : m_pool(pool) {} - template + ThreadGroup(ThreadPool& pool) : m_pool(pool) {} + template void add_task(F&& f, Args&&... args) { m_taskresults.emplace_back( - m_pool.add_task(std::forward(f), std::forward(args)...) - ); + m_pool.add_task(std::forward(f), std::forward(args)...)); } void wait_all() { - for (auto && result : m_taskresults) { + for (auto&& result : m_taskresults) { result.get(); } } + private: - ThreadPool & m_pool; + ThreadPool& m_pool; std::vector> m_taskresults; }; diff --git a/src/TimeControl.cpp b/src/TimeControl.cpp index f29158cd7..36e1ae199 100644 --- a/src/TimeControl.cpp +++ b/src/TimeControl.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,13 +14,27 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
-*/ -#include "TimeControl.h" + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. +*/ +#include #include #include -#include +#include +#include +#include + +#include "TimeControl.h" #include "GTP.h" #include "Timing.h" @@ -28,35 +42,101 @@ using namespace Utils; -TimeControl::TimeControl(int boardsize, int maintime, int byotime, - int byostones, int byoperiods) +TimeControl::TimeControl(const int maintime, const int byotime, + const int byostones, const int byoperiods) : m_maintime(maintime), m_byotime(byotime), m_byostones(byostones), - m_byoperiods(byoperiods), - m_boardsize(boardsize) { + m_byoperiods(byoperiods) { reset_clocks(); } -std::string TimeControl::to_text_sgf() { +std::string TimeControl::stones_left_to_text_sgf(const int color) const { + auto s = std::string{}; + // We must be in byo-yomi before interpreting stones. + if (m_inbyo[color]) { + const auto c = color == FastBoard::BLACK ? "OB[" : "OW["; + if (m_byostones) { + s += c + std::to_string(m_stones_left[color]) + "]"; + } else if (m_byoperiods) { + // KGS extension. + s += c + std::to_string(m_periods_left[color]) + "]"; + } + } + return s; +} + +std::string TimeControl::to_text_sgf() const { if (m_byotime != 0 && m_byostones == 0 && m_byoperiods == 0) { - return ""; // infinite + return ""; // Infinite time. 
} - auto s = "TM[" + std::to_string(m_maintime/100) + "]"; + auto s = "TM[" + std::to_string(m_maintime / 100) + "]"; if (m_byotime) { if (m_byostones) { s += "OT[" + std::to_string(m_byostones) + "/"; - s += std::to_string(m_byotime/100) + " Canadian]"; + s += std::to_string(m_byotime / 100) + " Canadian]"; } else { assert(m_byoperiods); s += "OT[" + std::to_string(m_byoperiods) + "x"; - s += std::to_string(m_byotime/100) + " byo-yomi]"; + s += std::to_string(m_byotime / 100) + " byo-yomi]"; } + s += stones_left_to_text_sgf(FastBoard::BLACK); + s += stones_left_to_text_sgf(FastBoard::WHITE); } + // Generously round up to avoid a remaining time of 0 triggering byo-yomi + // to be started when the sgf is loaded. This happens because byo-yomi + // stones have to be only written to the sgf when actually in byo-yomi + // and this is interpreted in adjust_time() as a special case + // that starts byo-yomi. + const auto black_time_left = + (m_remaining_time[FastBoard::BLACK] + 99) / 100; + const auto white_time_left = + (m_remaining_time[FastBoard::WHITE] + 99) / 100; + s += "BL[" + std::to_string(black_time_left) + "]"; + s += "WL[" + std::to_string(white_time_left) + "]"; return s; } +std::shared_ptr TimeControl::make_from_text_sgf( + const std::string& maintime, const std::string& byoyomi, + const std::string& black_time_left, const std::string& white_time_left, + const std::string& black_moves_left, const std::string& white_moves_left) { + const auto maintime_centis = std::stoi(maintime) * 100; + auto byotime = 0; + auto byostones = 0; + auto byoperiods = 0; + if (!byoyomi.empty()) { + std::smatch m; + const auto re_canadian = std::regex{"(\\d+)/(\\d+) Canadian"}; + const auto re_byoyomi = std::regex{"(\\d+)x(\\d+) byo-yomi"}; + if (std::regex_match(byoyomi, m, re_canadian)) { + byostones = std::stoi(m[1]); + byotime = std::stoi(m[2]) * 100; + } else if (std::regex_match(byoyomi, m, re_byoyomi)) { + byoperiods = std::stoi(m[1]); + byotime = std::stoi(m[2]) * 100; + } 
else { + // Unrecognised byo-yomi syntax. + } + } + const auto timecontrol_ptr = std::make_shared( + maintime_centis, byotime, byostones, byoperiods); + if (!black_time_left.empty()) { + const auto time = std::stoi(black_time_left) * 100; + const auto stones = + black_moves_left.empty() ? 0 : std::stoi(black_moves_left); + timecontrol_ptr->adjust_time(FastBoard::BLACK, time, stones); + } + if (!white_time_left.empty()) { + const auto time = std::stoi(white_time_left) * 100; + const auto stones = + white_moves_left.empty() ? 0 : std::stoi(white_moves_left); + timecontrol_ptr->adjust_time(FastBoard::WHITE, time, stones); + } + return timecontrol_ptr; +} + void TimeControl::reset_clocks() { m_remaining_time = {m_maintime, m_maintime}; m_stones_left = {m_byostones, m_byostones}; @@ -72,11 +152,11 @@ void TimeControl::reset_clocks() { } } -void TimeControl::start(int color) { +void TimeControl::start(const int color) { m_times[color] = Time(); } -void TimeControl::stop(int color) { +void TimeControl::stop(const int color) { Time stop; int elapsed_centis = Time::timediff_centis(m_times[color], stop); @@ -111,8 +191,8 @@ void TimeControl::stop(int color) { } } -void TimeControl::display_color_time(int color) { - auto rem = m_remaining_time[color] / 100; /* centiseconds to seconds */ +void TimeControl::display_color_time(const int color) { + auto rem = m_remaining_time[color] / 100; /* centiseconds to seconds */ auto minuteDiv = std::div(rem, 60); auto hourDiv = std::div(minuteDiv.quot, 60); auto seconds = minuteDiv.rem; @@ -137,10 +217,11 @@ void TimeControl::display_times() { myprintf("\n"); } -int TimeControl::max_time_for_move(int color, int movenum) { +int TimeControl::max_time_for_move(const int boardsize, const int color, + const size_t movenum) const { // default: no byo yomi (absolute) auto time_remaining = m_remaining_time[color]; - auto moves_remaining = get_moves_expected(movenum); + auto moves_remaining = get_moves_expected(boardsize, movenum); auto 
extra_time_per_move = 0; if (m_byotime != 0) { @@ -183,14 +264,15 @@ int TimeControl::max_time_for_move(int color, int movenum) { // always keep a cfg_lagbugger_cs centisecond margin // for network hiccups or GUI lag - auto base_time = std::max(time_remaining - cfg_lagbuffer_cs, 0) / - std::max(moves_remaining, 1); + auto base_time = std::max(time_remaining - cfg_lagbuffer_cs, 0) + / std::max(moves_remaining, 1); auto inc_time = std::max(extra_time_per_move - cfg_lagbuffer_cs, 0); return base_time + inc_time; } -void TimeControl::adjust_time(int color, int time, int stones) { +void TimeControl::adjust_time(const int color, const int time, + const int stones) { m_remaining_time[color] = time; // From pachi: some GTP things send 0 0 at the end of main time if (!time && !stones) { @@ -215,13 +297,14 @@ void TimeControl::adjust_time(int color, int time, int stones) { } } - -void TimeControl::set_boardsize(int boardsize) { - m_boardsize = boardsize; +size_t TimeControl::opening_moves(const int boardsize) const { + auto num_intersections = boardsize * boardsize; + auto fast_moves = num_intersections / 6; + return fast_moves; } - -int TimeControl::get_moves_expected(int movenum) { +int TimeControl::get_moves_expected(const int boardsize, + const size_t movenum) const { auto board_div = 5; if (cfg_timemanage != TimeManagement::OFF) { // We will take early exits with time management on, so @@ -231,15 +314,10 @@ int TimeControl::get_moves_expected(int movenum) { // Note this is constant as we play, so it's fair // to underestimate quite a bit. - auto base_remaining = (m_boardsize * m_boardsize) / board_div; + auto base_remaining = (boardsize * boardsize) / board_div; // Don't think too long in the opening. - auto fast_moves = 60; - if (m_boardsize < 19) { - // Alternative value tuned for 9x9. 
- fast_moves = 16; - } - + auto fast_moves = opening_moves(boardsize); if (movenum < fast_moves) { return (base_remaining + fast_moves) - movenum; } else { @@ -250,7 +328,7 @@ int TimeControl::get_moves_expected(int movenum) { // Returns true if we are in a time control where we // can save up time. If not, we should not move quickly // even if certain of our move, but plough ahead. -bool TimeControl::can_accumulate_time(int color) { +bool TimeControl::can_accumulate_time(const int color) const { if (m_inbyo[color]) { // Cannot accumulate in Japanese byo yomi if (m_byoperiods) { diff --git a/src/TimeControl.h b/src/TimeControl.h index bd9f412bd..04cb6337c 100644 --- a/src/TimeControl.h +++ b/src/TimeControl.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,14 +14,27 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef TIMECONTROL_H_INCLUDED #define TIMECONTROL_H_INCLUDED +#include "config.h" + #include +#include -#include "config.h" #include "Timing.h" class TimeControl { @@ -29,37 +42,40 @@ class TimeControl { /* Initialize time control. 
Timing info is per GTP and in centiseconds */ - TimeControl(int boardsize = BOARD_SIZE, - int maintime = 60 * 60 * 100, - int byotime = 0, int byostones = 25, - int byoperiods = 0); + TimeControl(int maintime = 60 * 60 * 100, int byotime = 0, + int byostones = 0, int byoperiods = 0); void start(int color); void stop(int color); - int max_time_for_move(int color, int movenum); + int max_time_for_move(int boardsize, int color, size_t movenum) const; void adjust_time(int color, int time, int stones); - void set_boardsize(int boardsize); void display_times(); void reset_clocks(); - bool can_accumulate_time(int color); - std::string to_text_sgf(); + bool can_accumulate_time(int color) const; + size_t opening_moves(int boardsize) const; + std::string to_text_sgf() const; + static std::shared_ptr make_from_text_sgf( + const std::string& maintime, const std::string& byoyomi, + const std::string& black_time_left, const std::string& white_time_left, + const std::string& black_moves_left, + const std::string& white_moves_left); private: + std::string stones_left_to_text_sgf(int color) const; void display_color_time(int color); - int get_moves_expected(int movenum); + int get_moves_expected(int boardsize, size_t movenum) const; int m_maintime; int m_byotime; int m_byostones; int m_byoperiods; - int m_boardsize; - std::array m_remaining_time; /* main time per player */ - std::array m_stones_left; /* stones to play in byo period */ - std::array m_periods_left; /* byo periods */ - std::array m_inbyo; /* player is in byo yomi */ + std::array m_remaining_time; /* main time per player */ + std::array m_stones_left; /* stones to play in byo period */ + std::array m_periods_left; /* byo periods */ + std::array m_inbyo; /* player is in byo yomi */ - std::array m_times; /* storage for player times */ + std::array m_times; /* storage for player times */ }; #endif diff --git a/src/Timing.cpp b/src/Timing.cpp index 3d6292aa6..e3cf4cc10 100644 --- a/src/Timing.cpp +++ b/src/Timing.cpp @@ 
-1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,22 +14,32 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . -*/ -#include "Timing.h" + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. +*/ #include +#include "Timing.h" -int Time::timediff_centis(Time start, Time end) { +int Time::timediff_centis(const Time start, const Time end) { return std::chrono::duration_cast (end.m_time - start.m_time).count() / 10; } -double Time::timediff_seconds(Time start, Time end) { +double Time::timediff_seconds(const Time start, const Time end) { return std::chrono::duration(end.m_time - start.m_time).count(); } -Time::Time(void) { +Time::Time() { m_time = std::chrono::steady_clock::now(); } diff --git a/src/Timing.h b/src/Timing.h index ff4260280..badc2926f 100644 --- a/src/Timing.h +++ b/src/Timing.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef TIMING_H_INCLUDED @@ -24,7 +35,7 @@ class Time { public: /* sets to current time */ - Time(void); + Time(); /* time difference in centiseconds */ static int timediff_centis(Time start, Time end); diff --git a/src/Training.cpp b/src/Training.cpp index 0f9107fa4..cca0c465e 100644 --- a/src/Training.cpp +++ b/src/Training.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,9 +14,18 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
-*/ -#include "Training.h" + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. +*/ #include #include @@ -29,6 +38,8 @@ #include #include +#include "Training.h" + #include "FastBoard.h" #include "FullBoard.h" #include "GTP.h" @@ -44,7 +55,7 @@ std::vector Training::m_data{}; -std::ostream& operator <<(std::ostream& stream, const TimeStep& timestep) { +std::ostream& operator<<(std::ostream& stream, const TimeStep& timestep) { stream << timestep.planes.size() << ' '; for (const auto plane : timestep.planes) { stream << plane << ' '; @@ -61,7 +72,7 @@ std::ostream& operator <<(std::ostream& stream, const TimeStep& timestep) { return stream; } -std::istream& operator>> (std::istream& stream, TimeStep& timestep) { +std::istream& operator>>(std::istream& stream, TimeStep& timestep) { int planes_size; stream >> planes_size; for (auto i = 0; i < planes_size; ++i) { @@ -84,16 +95,14 @@ std::istream& operator>> (std::istream& stream, TimeStep& timestep) { return stream; } -std::string OutputChunker::gen_chunk_name(void) const { +std::string OutputChunker::gen_chunk_name() const { auto base = std::string{m_basename}; base.append("." 
+ std::to_string(m_chunk_count) + ".gz"); return base; } -OutputChunker::OutputChunker(const std::string& basename, - bool compress) - : m_basename(basename), m_compress(compress) { -} +OutputChunker::OutputChunker(const std::string& basename, bool compress) + : m_basename(basename), m_compress(compress) {} OutputChunker::~OutputChunker() { flush_chunks(); @@ -120,7 +129,7 @@ void OutputChunker::flush_chunks() { if (!comp_size) { throw std::runtime_error("Error in gzip output"); } - Utils::myprintf("Writing chunk %d\n", m_chunk_count); + Utils::myprintf("Writing chunk %d\n", m_chunk_count); gzclose(out); } else { auto chunk_name = m_basename; @@ -146,20 +155,21 @@ TimeStep::NNPlanes Training::get_planes(const GameState* const state) { planes.resize(Network::INPUT_CHANNELS); for (auto c = size_t{0}; c < Network::INPUT_CHANNELS; c++) { - for (auto idx = 0; idx < BOARD_SQUARES; idx++) { - planes[c][idx] = bool(input_data[c * BOARD_SQUARES + idx]); + for (auto idx = 0; idx < NUM_INTERSECTIONS; idx++) { + planes[c][idx] = bool(input_data[c * NUM_INTERSECTIONS + idx]); } } return planes; } -void Training::record(GameState& state, UCTNode& root) { +void Training::record(Network& network, const GameState& state, + const UCTNode& root) { auto step = TimeStep{}; step.to_move = state.board.get_to_move(); step.planes = get_planes(&state); - auto result = - Network::get_scored_moves(&state, Network::Ensemble::DIRECT, 0); + const auto result = network.get_output(&state, Network::Ensemble::DIRECT, + Network::IDENTITY_SYMMETRY); step.net_winrate = result.winrate; const auto& best_node = root.get_best_root_child(step.to_move); @@ -167,7 +177,7 @@ void Training::record(GameState& state, UCTNode& root) { step.child_uct_winrate = best_node.get_eval(step.to_move); step.bestmove_visits = best_node.get_visits(); - step.probabilities.resize((BOARD_SQUARES) + 1); + step.probabilities.resize(POTENTIAL_MOVES); // Get total visit amount. 
We count rather // than trust the root to avoid ttable issues. @@ -191,14 +201,15 @@ void Training::record(GameState& state, UCTNode& root) { auto xy = state.board.get_xy(move); step.probabilities[xy.second * BOARD_SIZE + xy.first] = prob; } else { - step.probabilities[BOARD_SQUARES] = prob; + step.probabilities[NUM_INTERSECTIONS] = prob; } } m_data.emplace_back(step); } -void Training::dump_training(int winner_color, const std::string& filename) { +void Training::dump_training(const int winner_color, + const std::string& filename) { auto chunker = OutputChunker{filename, true}; dump_training(winner_color, chunker); } @@ -231,7 +242,7 @@ void Training::load_training(std::ifstream& in) { } } -void Training::dump_training(int winner_color, OutputChunker& outchunk) { +void Training::dump_training(const int winner_color, OutputChunker& outchunk) { auto training_str = std::string{}; for (const auto& step : m_data) { auto out = std::stringstream{}; @@ -240,13 +251,13 @@ void Training::dump_training(int winner_color, OutputChunker& outchunk) { const auto& plane = step.planes[p]; // Write it out as a string of hex characters for (auto bit = size_t{0}; bit + 3 < plane.size(); bit += 4) { - auto hexbyte = plane[bit] << 3 - | plane[bit + 1] << 2 - | plane[bit + 2] << 1 - | plane[bit + 3] << 0; + auto hexbyte = plane[bit] << 3 + | plane[bit + 1] << 2 + | plane[bit + 2] << 1 + | plane[bit + 3] << 0; out << std::hex << hexbyte; } - // BOARD_SQUARES % 4 = 1 so the last bit goes by itself + // NUM_INTERSECTIONS % 4 = 1 so the last bit goes by itself // for odd sizes assert(plane.size() % 4 == 1); out << plane[plane.size() - 1]; @@ -255,9 +266,9 @@ void Training::dump_training(int winner_color, OutputChunker& outchunk) { // The side to move planes can be compactly encoded into a single // bit, 0 = black to move. out << (step.to_move == FastBoard::BLACK ? 
"0" : "1") << std::endl; - // Then a BOARD_SQUARES + 1 long array of float probabilities - for (auto it = begin(step.probabilities); - it != end(step.probabilities); ++it) { + // Then a POTENTIAL_MOVES long array of float probabilities + for (auto it = begin(step.probabilities); it != end(step.probabilities); + ++it) { out << *it; if (next(it) != end(step.probabilities)) { out << " "; @@ -300,7 +311,8 @@ void Training::dump_debug(OutputChunker& outchunk) { outchunk.append(debug_str); } -void Training::process_game(GameState& state, size_t& train_pos, int who_won, +void Training::process_game(GameState& state, size_t& train_pos, + const int who_won, const std::vector& tree_moves, OutputChunker& outchunker) { clear_training(); @@ -324,14 +336,14 @@ void Training::process_game(GameState& state, size_t& train_pos, int who_won, auto xy = state.board.get_xy(move_vertex); move_idx = (xy.second * BOARD_SIZE) + xy.first; } else { - move_idx = BOARD_SQUARES; // PASS + move_idx = NUM_INTERSECTIONS; // PASS } auto step = TimeStep{}; step.to_move = to_move; step.planes = get_planes(&state); - step.probabilities.resize(BOARD_SQUARES + 1); + step.probabilities.resize(POTENTIAL_MOVES); step.probabilities[move_idx] = 1.0f; train_pos++; @@ -392,8 +404,7 @@ void Training::dump_supervised(const std::string& sgf_name, continue; } - process_game(*state, train_pos, who_won, tree_moves, - outchunker); + process_game(*state, train_pos, who_won, tree_moves, outchunker); } std::cout << "Dumped " << train_pos << " training positions." << std::endl; diff --git a/src/Training.h b/src/Training.h index 3e060ba93..818603b45 100644 --- a/src/Training.h +++ b/src/Training.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef TRAINING_H_INCLUDED @@ -33,7 +44,7 @@ class TimeStep { public: - using BoardPlane = std::bitset; + using BoardPlane = std::bitset; using NNPlanes = std::vector; NNPlanes planes; std::vector probabilities; @@ -44,8 +55,8 @@ class TimeStep { int bestmove_visits; }; -std::ostream& operator<< (std::ostream& stream, const TimeStep& timestep); -std::istream& operator>> (std::istream& stream, TimeStep& timestep); +std::ostream& operator<<(std::ostream& stream, const TimeStep& timestep); +std::istream& operator>>(std::istream& stream, TimeStep& timestep); class OutputChunker { public: @@ -55,6 +66,7 @@ class OutputChunker { // Group this many games in a batch. 
static constexpr size_t CHUNK_SIZE = 32; + private: std::string gen_chunk_name() const; void flush_chunks(); @@ -71,7 +83,8 @@ class Training { static void dump_training(int winner_color, const std::string& out_filename); static void dump_debug(const std::string& out_filename); - static void record(GameState& state, UCTNode& node); + static void record(Network& network, const GameState& state, + const UCTNode& node); static void dump_supervised(const std::string& sgf_file, const std::string& out_filename); @@ -79,12 +92,11 @@ class Training { static void load_training(const std::string& filename); private: - static TimeStep::NNPlanes get_planes(const GameState* const state); + static TimeStep::NNPlanes get_planes(const GameState* state); static void process_game(GameState& state, size_t& train_pos, int who_won, const std::vector& tree_moves, OutputChunker& outchunker); - static void dump_training(int winner_color, - OutputChunker& outchunker); + static void dump_training(int winner_color, OutputChunker& outchunker); static void dump_debug(OutputChunker& outchunker); static void save_training(std::ofstream& out); static void load_training(std::ifstream& in); diff --git a/src/Tuner.cpp b/src/Tuner.cpp index c61d0c521..4d4dadeb2 100644 --- a/src/Tuner.cpp +++ b/src/Tuner.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" @@ -21,41 +32,63 @@ #ifdef USE_OPENCL #include #include +#include +#include #include -#include -#include #include #include -#include -#include +#include +#include +#ifndef USE_BLAS +#include +#endif #include "GTP.h" #include "OpenCL.h" +#include "Random.h" #include "Tuner.h" #include "Utils.h" -#include "Random.h" -#ifdef __APPLE__ -#include -#endif -#ifdef USE_MKL -#include -#endif -#ifdef USE_OPENBLAS -#include +const auto TUNER_FILE_LOCAL = std::string("leelaz_opencl_tuning"); + +template std::vector Tuner::tuned_devices; + +#ifndef USE_BLAS +// Eigen helpers +template using EigenMatrixMap = + Eigen::Map>; +template using ConstEigenMatrixMap = + Eigen::Map>; #endif -const auto TUNER_FILE_LOCAL = std::string("leelaz_opencl_tuning"); +template static std::string getTunerKernel(); +template static float getTunerMaxError(); + +template <> +std::string getTunerKernel() { + return std::string("XgemmBatched"); +} + +template <> +float getTunerMaxError() { + return 1e-4f; +} + #ifdef USE_HALF -const auto TUNER_KERNEL = std::string("XgemmBatchedHalf"); -constexpr auto MAX_ERROR = 1e-2f; -#else -const auto TUNER_KERNEL = std::string("XgemmBatched"); -constexpr auto MAX_ERROR = 1e-4f; +template <> +std::string getTunerKernel() { + return std::string("XgemmBatchedHalf"); +} + +template <> +float getTunerMaxError() { + return 1e-1f; +} #endif using namespace Utils; +template static void sgemmBatched_ref(const std::vector& a, const 
std::vector& b, std::vector& c, @@ -72,60 +105,114 @@ static void sgemmBatched_ref(const std::vector& a, auto offset_u = batch * m * k; auto offset_v = batch * n * k; auto offset_m = batch * m * n; - - cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, - m, n, k, - 1.0f, - &ar[offset_u], m, - &br[offset_v], n, - 0.0f, - &cr[offset_m], n); +#ifdef USE_BLAS + // Calculates C = transpose(transpose(A) * B) in row major, or + // C = A * transpose(B) in column major. + for (auto i = 0; i < m; i++) { + for (auto j = 0; j < n; j++) { + auto acc = 0.0f; + for (auto l = 0; l < k; l++) { + acc += ar[l * m + i + offset_u] * br[l * n + j + offset_v]; + } + cr[j * m + i + offset_m] = acc; + } + } +#else + auto C = EigenMatrixMap(cr.data() + offset_m, m, n); + auto A = ConstEigenMatrixMap(ar.data() + offset_u, m, k); + auto B = ConstEigenMatrixMap(br.data() + offset_v, n, k); + C.noalias() = (A * B.transpose()); +#endif } std::copy(begin(cr), end(cr), begin(c)); } - static bool IsMultiple(const size_t a, const size_t b) { return (a % b == 0); } -bool Tuner::valid_config_sgemm(Parameters p, bool exhaustive) { - if (!IsMultiple(p["MWG"], p["MDIMC"]*p["VWM"])) { - return false; - } - if (!IsMultiple(p["NWG"], p["NDIMC"]*p["VWN"])) { - return false; - } - if (!IsMultiple(p["MWG"], p["MDIMA"]*p["VWM"])) { - return false; - } - if (!IsMultiple(p["NWG"], p["NDIMB"]*p["VWN"])) { - return false; - } - if (!IsMultiple(p["KWG"], p["MDIMC"]*p["NDIMC"]/p["MDIMA"])) { - return false; - } - if (!IsMultiple(p["KWG"], p["MDIMC"]*p["NDIMC"]/p["NDIMB"])) { - return false; - } - // Extra restrictions for a fast tuning run - if (!exhaustive) { - if (p["MDIMC"] != p["MDIMA"]) { +template +bool Tuner::valid_config_sgemm(Parameters p, const bool exhaustive) { + if (p["TCE"] == 0) { + if (!IsMultiple(p["MWG"], p["MDIMC"] * p["VWM"])) { + return false; + } + if (!IsMultiple(p["NWG"], p["NDIMC"] * p["VWN"])) { + return false; + } + if (!IsMultiple(p["MWG"], p["MDIMA"] * p["VWM"])) { + return false; + } + 
if (!IsMultiple(p["NWG"], p["NDIMB"] * p["VWN"])) { + return false; + } + if (!IsMultiple(p["KWG"], p["MDIMC"] * p["NDIMC"] / p["MDIMA"])) { + return false; + } + if (!IsMultiple(p["KWG"], p["MDIMC"] * p["NDIMC"] / p["NDIMB"])) { + return false; + } + // Extra restrictions for a fast tuning run + if (!exhaustive) { + if (p["MDIMC"] != p["MDIMA"]) { + return false; + } + if (p["NDIMC"] != p["NDIMB"]) { + return false; + } + if (p["SA"] != p["SB"]) { + return false; + } + } + } else { + if (!m_use_tensorcore) { + return false; + } + + // In Tensor Core implementations, MDIMA and NDIMB represents the + // wmmv multiplication dimensions, that is, + // m16n16k16 / m32n8k16 / m8n32k16. Thus m * n is fixed to 256. + if (p["MDIMA"] * p["NDIMB"] != 256) { + return false; + } + if (p["MWG"] < p["MDIMC"]) { + return false; + } + if (p["NWG"] < p["NDIMC"]) { + return false; + } + if (p["MDIMC"] < p["MDIMA"]) { return false; } - if (p["NDIMC"] != p["NDIMB"]) { + if (p["NDIMC"] < p["NDIMB"]) { return false; } - if (p["SA"] != p["SB"]) { + if (p["MWG"] < 32) { + return false; + } + if (p["NWG"] < 32) { + return false; + } + if (p["KWG"] < 32) { + return false; + } + // VWM / VWN has no meaning if we don't do SA / SB. 
+ // Only test VWM / VWN == 2 + if (p["SA"] == 0 && p["VWM"] != 2) { + return false; + } + if (p["SB"] == 0 && p["VWN"] != 2) { return false; } } return true; } -Parameters Tuner::get_parameters_by_int(const std::vector& opts, - const int n) { +template +Parameters Tuner::get_parameters_by_int( + const std::vector& opts, const int n) { + Parameters param; std::vector choices(opts.size()); @@ -147,7 +234,8 @@ Parameters Tuner::get_parameters_by_int(const std::vector& opts, return param; } -std::string Tuner::parameters_to_defines(const Parameters& p) { +template +std::string Tuner::parameters_to_defines(const Parameters& p) { std::string s; for (auto const& x : p) { s += " -D" + x.first + "=" + std::to_string(x.second); @@ -155,7 +243,8 @@ std::string Tuner::parameters_to_defines(const Parameters& p) { return s; } -std::string Tuner::parameters_to_string(const Parameters& p) { +template +std::string Tuner::parameters_to_string(const Parameters& p) { std::string s; for (auto const& x : p) { s += x.first + "=" + std::to_string(x.second) + " "; @@ -170,48 +259,49 @@ static size_t next_power_of_two(const size_t x) { return 2 << size_t(std::ceil(std::log2(x)) - 1); } -static void sgemm_generate_data(std::vector &x, - const int m, const int n, - const int batch_size, - const int m_ceil, const int n_ceil) { +template +static void sgemm_generate_data(std::vector& x, const int m, const int n, + const int batch_size, const int m_ceil, + const int n_ceil) { for (auto batch = 0; batch < batch_size; batch++) { for (auto i = 0; i < n_ceil; i++) { if (i < n) { for (auto j = 0; j < m; j++) { - x[batch*n_ceil*m_ceil + i*m_ceil + j] = - 0.01f*(((i ^ j) + batch - 50) % 100); + x[batch * n_ceil * m_ceil + i * m_ceil + j] = + (((i ^ j) + batch - 128) % 256) / 256.0f; } for (auto j = m; j < m_ceil; j++) { - x[batch*n_ceil*m_ceil + i*m_ceil + j] = 0.0f; + x[batch * n_ceil * m_ceil + i * m_ceil + j] = 0.0f; } } else { for (auto j = 0; j < m_ceil; j++) { - x[batch*n_ceil*m_ceil + i*m_ceil + 
j] = 0.0f; + x[batch * n_ceil * m_ceil + i * m_ceil + j] = 0.0f; } } } } } -static float compare_ref(std::vector &x, std::vector &ref, +template +static float compare_ref(std::vector& x, std::vector& ref, const int m, const int n, const int batch_size, const int m_ceil, const int n_ceil) { auto sum = 0.0f; for (auto batch = 0; batch < batch_size; batch++) { - for (auto i = 0; i < n; i++) { - for (auto j = 0; j < m; j++) { - auto r = ref[batch*n*m + i*m + j]; - auto y = x[batch*n_ceil*m_ceil + j*n_ceil + i]; + for (auto j = 0; j < m; j++) { + for (auto i = 0; i < n; i++) { + auto r = ref[batch * n * m + j * n + i]; + auto y = x[batch * n_ceil * m_ceil + j * n_ceil + i]; sum += (r - y) * (r - y); } } } - return sum / (m*n); + return sum / (m * n * batch_size); } -std::string Tuner::tune_sgemm(const int m, const int n, const int k, - const int batch_size, const int runs) { +template +std::vector Tuner::build_valid_params() { auto opts = std::vector(); if (cfg_sgemm_exhaustive) { opts = { @@ -248,18 +338,88 @@ std::string Tuner::tune_sgemm(const int m, const int n, const int k, {"SB", {1}}, }; } + // Tensor Core options + auto topts = std::vector(); + if (cfg_sgemm_exhaustive) { + topts = { + {"MWG", {32, 64, 128, 256}}, + {"NWG", {8, 16, 32, 64, 128, 256}}, + {"KWG", {16, 32, 64, 128, 256}}, + {"MDIMC", {8, 16, 32, 64}}, + {"NDIMC", {8, 16, 32, 64}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {2, 4, 8}}, + {"VWN", {2, 4, 8}}, + {"STRM", {0}}, + {"STRN", {0}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; + } else { + topts = { + {"MWG", {32, 64, 128}}, + {"NWG", {16, 32, 64, 128}}, + {"KWG", {16, 32, 64, 128}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {2}}, + {"VWN", {2}}, + {"STRM", {0}}, + {"STRN", {0}}, + {"SA", {0}}, + {"SB", {0}}, + }; + } + + auto valid_params = std::vector{}; + auto build_from = [this, &valid_params](std::vector& opts, + int 
tce) { + auto cfgs = 1; + for (auto c = size_t{0}; c < opts.size(); c++) { + cfgs *= opts[c].second.size(); + } + for (auto i = 0; i < cfgs; i++) { + Parameters param = get_parameters_by_int(opts, i); + param["TCE"] = tce; + if (valid_config_sgemm(param, cfg_sgemm_exhaustive)) { + valid_params.push_back(param); + } + } + }; + build_from(opts, 0); + build_from(topts, 1); + + // Don't use thread RNG or determinism will depend on whether tuner ran. + auto rng = Random{0}; + std::shuffle(begin(valid_params), end(valid_params), rng); + + if (cfg_sgemm_exhaustive) { + // Likely too many valid params, cut out some of them + valid_params.resize(valid_params.size() / 16); + } + return valid_params; +} + +template +std::string Tuner::tune_sgemm(const int m, const int n, const int k, + const int batch_size, const int runs) { // This needs to be at minimum the maximum (MNK/WG) values above. - auto m_max = std::max(64, m); - auto n_max = std::max(64, n); - auto k_max = std::max(32, k); + auto m_max = std::max(256, m); + auto n_max = std::max(256, n); + auto k_max = std::max(256, k); - auto at_size = batch_size - * next_power_of_two(k_max) * next_power_of_two(m_max); - auto b_size = batch_size - * next_power_of_two(k_max) * next_power_of_two(n_max); - auto c_size = batch_size - * next_power_of_two(m_max) * next_power_of_two(n_max); + auto at_size = + batch_size * next_power_of_two(k_max) * next_power_of_two(m_max); + auto b_size = + batch_size * next_power_of_two(k_max) * next_power_of_two(n_max); + auto c_size = + batch_size * next_power_of_two(m_max) * next_power_of_two(n_max); auto total_flops = batch_size * 2.0 * m * n * k; @@ -273,58 +433,39 @@ std::string Tuner::tune_sgemm(const int m, const int n, const int k, sgemmBatched_ref(at, b, c_ref, m, n, k, batch_size); - auto aBuffer = cl::Buffer( - m_context, - CL_MEM_READ_WRITE, sizeof(net_t) * at_size, nullptr, nullptr); - auto bBuffer = cl::Buffer( - m_context, - CL_MEM_READ_WRITE, sizeof(net_t) * b_size, nullptr, nullptr); 
- auto cBuffer = cl::Buffer( - m_context, - CL_MEM_READ_WRITE, sizeof(net_t) * c_size, nullptr, nullptr); + auto aBuffer = cl::Buffer(m_context, CL_MEM_READ_WRITE, + sizeof(net_t) * at_size, nullptr, nullptr); + auto bBuffer = cl::Buffer(m_context, CL_MEM_READ_WRITE, + sizeof(net_t) * b_size, nullptr, nullptr); + auto cBuffer = cl::Buffer(m_context, CL_MEM_READ_WRITE, + sizeof(net_t) * c_size, nullptr, nullptr); myprintf("\nStarted OpenCL SGEMM tuner.\n"); - auto valid_params = std::vector{}; - auto cfgs = 1; - for (auto c = size_t{0}; c < opts.size(); c++) { - cfgs *= opts[c].second.size(); - } - - // Don't use thead Rng or determism will depend on if tuner ran. - auto rng = Random{0}; + auto valid_params = build_valid_params(); - for (auto i = 0; i < cfgs; i++) { - Parameters param = get_parameters_by_int(opts, i); - if (valid_config_sgemm(param, cfg_sgemm_exhaustive)) { - if (cfg_sgemm_exhaustive) { - if (rng.randfix<16>() != 0) { - continue; - } - } - valid_params.emplace_back(i); - } - } myprintf("Will try %zu valid configurations.\n", valid_params.size()); std::string best_params; auto best_time = unsigned{0}; - auto queue = cl::CommandQueue(m_context, - m_device, - CL_QUEUE_PROFILING_ENABLE); + auto queue = + cl::CommandQueue(m_context, m_device, CL_QUEUE_PROFILING_ENABLE); auto event = cl::Event(); - auto program = cl::Program(m_context, sourceCode_sgemm); + auto program = cl::Program(m_context, sourceCode_common + sourceCode_sgemm); auto m_ceil_prev = 0; auto n_ceil_prev = 0; auto k_ceil_prev = 0; auto param_counter = size_t{0}; + auto min_error = 100.0f; + auto failed_compile = 0; + auto failed_enqueue = 0; + auto failed_error = 0; - for (const auto& i : valid_params) { + for (auto& p : valid_params) { param_counter++; - auto p = get_parameters_by_int(opts, i); auto defines = parameters_to_defines(p); try { @@ -332,18 +473,17 @@ std::string Tuner::tune_sgemm(const int m, const int n, const int k, program.build(args.c_str()); } catch (const cl::Error&) { 
// Failed to compile, get next parameter + failed_compile++; continue; } - // The kernel is (for now) named the same even in USE_HALF auto sgemm_kernel = cl::Kernel(program, "XgemmBatched"); auto m_ceil = int(ceilMultiple(ceilMultiple(m, p["MWG"]), p["VWM"])); auto n_ceil = int(ceilMultiple(ceilMultiple(n, p["NWG"]), p["VWN"])); auto k_ceil = int(ceilMultiple(ceilMultiple(k, p["KWG"]), p["VWM"])); - if (m_ceil != m_ceil_prev - || n_ceil != n_ceil_prev + if (m_ceil != m_ceil_prev || n_ceil != n_ceil_prev || k_ceil != k_ceil_prev) { m_ceil_prev = m_ceil; n_ceil_prev = n_ceil; @@ -368,18 +508,26 @@ std::string Tuner::tune_sgemm(const int m, const int n, const int k, cl::NDRange local_sgemm = {p["MDIMC"], p["NDIMC"], 1}; - cl::NDRange size_sgemm = {(m_ceil * p["MDIMC"]) / p["MWG"], (n_ceil * p["NDIMC"]) / p["NWG"], size_t(batch_size)}; + // Tensor Core implementation uses a different dimension. + if (p["TCE"]) { + local_sgemm = {32 * p["MDIMC"] / p["MDIMA"], + p["NDIMC"] / p["NDIMB"], 1}; + size_sgemm = {32 * m_ceil / p["MDIMA"] * p["MDIMC"] / p["MWG"], + n_ceil / p["NDIMB"] * p["NDIMC"] / p["NWG"], + size_t(batch_size)}; + } auto sum = 0.0f; - auto max_error = 0.0f; + auto error = 0.0f; + for (auto r = 0; r < runs; r++) { try { queue.enqueueNDRangeKernel(sgemm_kernel, cl::NullRange, - size_sgemm, local_sgemm, - nullptr, &event); + size_sgemm, local_sgemm, nullptr, + &event); queue.finish(); event.wait(); @@ -387,9 +535,9 @@ std::string Tuner::tune_sgemm(const int m, const int n, const int k, c_size * sizeof(net_t), c.data()); queue.finish(); - auto this_error = compare_ref(c, c_ref, n, m, batch_size, - n_ceil, m_ceil); - max_error = std::max(max_error, this_error); + auto this_error = + compare_ref(c, c_ref, n, m, batch_size, n_ceil, m_ceil); + error = std::max(error, this_error); auto elapsed = event.getProfilingInfo() - @@ -397,36 +545,63 @@ std::string Tuner::tune_sgemm(const int m, const int n, const int k, sum += elapsed; } catch (const cl::Error&) { - // Failed 
to enqueue kernel. Set error to max. - max_error = MAX_ERROR; + // Failed to enqueue kernel. Set error to some big number. + failed_enqueue++; + error = std::numeric_limits::max(); + // This failure will be counted to be failed due to error, + // so preemptively subtract one from that count. + failed_error--; break; } } - if (max_error < MAX_ERROR && (best_time == 0 || sum < best_time)) { + + min_error = std::min(min_error, error); + + if (error >= getTunerMaxError()) { + failed_error++; + } + + if (error < getTunerMaxError() + && (best_time == 0 || sum < best_time)) { auto param_str = parameters_to_string(p); auto kernel_ms = 1e-6f * (sum / runs); // Timing is in nanoseconds (10^-9), Giga = 10^9, so this works out auto kernel_gflops = total_flops / (sum / runs); myprintf("(%u/%u) %s %.4f ms (%.1f GFLOPS)\n", - param_counter, valid_params.size(), param_str.c_str(), - kernel_ms, kernel_gflops); + param_counter, valid_params.size(), param_str.c_str(), + kernel_ms, kernel_gflops); best_time = sum; best_params = defines; } } if (best_time == 0) { - printf("Failed to find a working configuration.\nCheck your OpenCL drivers.\n"); + if (failed_compile > 0) { + myprintf_error("Failed to compile: %d kernels.\n", failed_compile); + } + if (failed_enqueue > 0) { + myprintf_error("Failed to enqueue: %d kernels\n", failed_enqueue); + } + if (failed_error > 0) { + myprintf_error("Too high error: %d kernels\n", failed_error); + } + myprintf_error("Failed to find a working configuration.\n" + "Check your OpenCL drivers.\n"); + myprintf_error("Minimum error: %f. 
Error bound: %f\n", + min_error, getTunerMaxError()); throw std::runtime_error("Tuner failed to find working configuration."); } return best_params; } -void Tuner::store_sgemm_tuners(const int m, const int n, const int k, - const int batch_size, std::string tuners) { +template +void Tuner::store_sgemm_tuners(const int m, const int n, const int k, + const int batch_size, + std::string tuners) { + auto tuner_file = leelaz_file(TUNER_FILE_LOCAL); auto file_contents = std::vector(); { // Read the previous contents to string - auto file = std::ifstream{TUNER_FILE_LOCAL}; + auto file = std::ifstream{tuner_file}; if (file.good()) { auto line = std::string{}; while (std::getline(file, line)) { @@ -434,14 +609,15 @@ void Tuner::store_sgemm_tuners(const int m, const int n, const int k, } } } - auto file = std::ofstream{TUNER_FILE_LOCAL}; + auto file = std::ofstream{tuner_file}; auto device_name = m_opencl.get_device_name(); auto tuning_params = std::stringstream{}; tuning_params << m << ";" << n << ";" << k << ";" << batch_size; auto tuning_line_prefix = std::to_string(TUNER_VERSION) + ";" - + TUNER_KERNEL + ";" + tuning_params.str() + ";"; + + getTunerKernel() + ";" + + tuning_params.str() + ";"; auto tuning_line = tuning_line_prefix + tuners + ";" + device_name; // Write back previous data as long as it's not the device and @@ -458,14 +634,14 @@ void Tuner::store_sgemm_tuners(const int m, const int n, const int k, if (file.fail()) { myprintf("Could not save the tuning result.\n"); - myprintf("Do I have write permissions on %s?\n", - TUNER_FILE_LOCAL.c_str()); + myprintf("Do I have write permissions on %s?\n", tuner_file.c_str()); } } -std::string Tuner::sgemm_tuners_from_line(std::string line, - const int m, const int n, const int k, - const int batch_size) { +template +std::string Tuner::sgemm_tuners_from_line(std::string line, const int m, + const int n, const int k, + const int batch_size) { auto s = std::vector{}; auto ss = std::stringstream{line}; auto item = 
std::string{}; @@ -482,7 +658,7 @@ std::string Tuner::sgemm_tuners_from_line(std::string line, return ""; } - if (s[1] != TUNER_KERNEL) { + if (s[1] != getTunerKernel()) { return ""; } @@ -509,10 +685,27 @@ std::string Tuner::sgemm_tuners_from_line(std::string line, return s[6]; } -std::string Tuner::load_sgemm_tuners(const int m, const int n, const int k, - const int batch_size) { - auto file = std::ifstream{TUNER_FILE_LOCAL}; - if (!cfg_sgemm_exhaustive && file.good()) { +template +std::string Tuner::load_sgemm_tuners(const int m, const int n, + const int k, const int batch_size) { + auto tuner_file = leelaz_file(TUNER_FILE_LOCAL); + auto file = std::ifstream{tuner_file}; + + auto try_prior_tuning = file.good(); + + // If we want full tuning, don't reuse previously tuned results + // except if the tuning was created from this run from a different + // GPU instance with the same name. This prevents the tuner running + // for multiple times if the system has multiple same GPUs. + if (try_prior_tuning && cfg_sgemm_exhaustive) { + auto dev = m_opencl.get_device_name(); + try_prior_tuning = + std::any_of(begin(tuned_devices), end(tuned_devices), + [&dev](const std::string& x) { return dev == x; }); + } + tuned_devices.emplace_back(m_opencl.get_device_name()); + + if (try_prior_tuning) { auto line = std::string{}; while (std::getline(file, line)) { auto tuners = sgemm_tuners_from_line(line, m, n, k, batch_size); @@ -527,4 +720,17 @@ std::string Tuner::load_sgemm_tuners(const int m, const int n, const int k, return tuners; } +template +void Tuner::enable_tensorcore() {} + +template <> +void Tuner::enable_tensorcore() { + m_use_tensorcore = true; +} + +template class Tuner; +#ifdef USE_HALF +template class Tuner; +#endif + #endif diff --git a/src/Tuner.h b/src/Tuner.h index 7f4cf7093..d8df77749 100644 --- a/src/Tuner.h +++ b/src/Tuner.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,45 +14,68 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef SGEMM_TUNER_H_INCLUDED #define SGEMM_TUNER_H_INCLUDED #include "config.h" -#include + #include #include +#include using Configurations = std::pair>; using Parameters = std::map; -class OpenCL; +template class OpenCL; +template class Tuner { - OpenCL & m_opencl; + OpenCL& m_opencl; cl::Context m_context; cl::Device m_device; + bool m_use_tensorcore = false; + public: - std::string tune_sgemm(const int m, const int n, const int k, - const int batch_size, const int runs = 4); - std::string load_sgemm_tuners(const int m, const int n, const int k, - const int batch_size); - - static constexpr auto TUNER_VERSION = 0; - Tuner(OpenCL & opencl, cl::Context context, cl::Device device) : - m_opencl(opencl), m_context(context), m_device(device) {} + std::string tune_sgemm(int m, int n, int k, int batch_size, int runs = 4); + std::string load_sgemm_tuners(int m, int n, int k, int batch_size); + + // list of device types that was tuned in this run. + // This is to prevent the same device from being tuned multiple times. 
+ static std::vector tuned_devices; + + // version 0 : Initial release + // version 1 : Tuner with additional tensor cores (parameter TCE) + static constexpr auto TUNER_VERSION = 1; + + Tuner(OpenCL& opencl, cl::Context context, cl::Device device) + : m_opencl(opencl), m_context(context), m_device(device) {} + + void enable_tensorcore(); + private: - void store_sgemm_tuners(const int m, const int n, const int k, - const int batch_size, std::string tuners); + void store_sgemm_tuners(int m, int n, int k, int batch_size, + std::string tuners); bool valid_config_sgemm(Parameters p, bool exhaustive); std::string parameters_to_defines(const Parameters& p); std::string parameters_to_string(const Parameters& p); Parameters get_parameters_by_int(const std::vector& opts, - const int n); - std::string sgemm_tuners_from_line(std::string line, const int m, - const int n, const int k, - const int batch_size); + int n); + std::string sgemm_tuners_from_line(std::string line, int m, int n, int k, + int batch_size); + std::vector build_valid_params(); }; #endif diff --git a/src/UCTNode.cpp b/src/UCTNode.cpp index f5649998c..b7d4e91a4 100644 --- a/src/UCTNode.cpp +++ b/src/UCTNode.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto + Copyright (C) 2017-2019 Gian-Carlo Pascutto Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,15 +14,26 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" -#include -#include -#include #include +#include #include +#include +#include #include #include #include @@ -31,6 +42,7 @@ #include #include "UCTNode.h" + #include "FastBoard.h" #include "FastState.h" #include "GTP.h" @@ -40,59 +52,56 @@ using namespace Utils; -UCTNode::UCTNode(int vertex, float score) : m_move(vertex), m_score(score) { -} +UCTNode::UCTNode(const int vertex, const float policy) + : m_move(vertex), m_policy(policy) {} bool UCTNode::first_visit() const { return m_visits == 0; } -SMP::Mutex& UCTNode::get_mutex() { - return m_nodemutex; -} - -bool UCTNode::create_children(std::atomic& nodecount, - GameState& state, - float& eval, - float min_psa_ratio) { - // check whether somebody beat us to it (atomic) - if (!expandable(min_psa_ratio)) { - return false; - } - // acquire the lock - LOCK(get_mutex(), lock); +bool UCTNode::create_children(Network& network, std::atomic& nodecount, + const GameState& state, float& eval, + const float min_psa_ratio) { // no successors in final state if (state.get_passes() >= 2) { return false; } - // check whether somebody beat us to it (after taking the lock) - if (!expandable(min_psa_ratio)) { + + // acquire the lock + if (!acquire_expanding()) { return false; } - // Someone else is running the expansion - if (m_is_expanding) { + + // can we actually expand? 
+ if (!expandable(min_psa_ratio)) { + expand_done(); return false; } - // We'll be the one queueing this node for expansion, stop others - m_is_expanding = true; - lock.unlock(); - const auto raw_netlist = Network::get_scored_moves( - &state, Network::Ensemble::RANDOM_SYMMETRY); + NNCache::Netresult raw_netlist; + try { + raw_netlist = + network.get_output(&state, Network::Ensemble::RANDOM_SYMMETRY); + } catch (NetworkHaltException&) { + expand_cancel(); + throw; + } // DCNN returns winrate as side to move - m_net_eval = raw_netlist.winrate; + const auto stm_eval = raw_netlist.winrate; const auto to_move = state.board.get_to_move(); // our search functions evaluate from black's point of view - if (state.board.white_to_move()) { - m_net_eval = 1.0f - m_net_eval; + if (to_move == FastBoard::WHITE) { + m_net_eval = 1.0f - stm_eval; + } else { + m_net_eval = stm_eval; } eval = m_net_eval; - std::vector nodelist; + std::vector nodelist; auto legal_sum = 0.0f; - for (auto i = 0; i < BOARD_SQUARES; i++) { + for (auto i = 0; i < NUM_INTERSECTIONS; i++) { const auto x = i % BOARD_SIZE; const auto y = i / BOARD_SIZE; const auto vertex = state.board.get_vertex(x, y); @@ -101,8 +110,29 @@ bool UCTNode::create_children(std::atomic& nodecount, legal_sum += raw_netlist.policy[i]; } } - nodelist.emplace_back(raw_netlist.policy_pass, FastBoard::PASS); - legal_sum += raw_netlist.policy_pass; + + // Always try passes if we're not trying to be clever. + auto allow_pass = cfg_dumbpass; + + // Less than 20 available intersections in a 19x19 game. + if (int(nodelist.size()) <= std::max(5, BOARD_SIZE)) { + allow_pass = true; + } + + // If we're clever, only try passing if we're winning on the + // net score and on the board count. + if (!allow_pass && stm_eval > 0.8f) { + const auto relative_score = + (to_move == FastBoard::BLACK ? 
1 : -1) * state.final_score(); + if (relative_score >= 0) { + allow_pass = true; + } + } + + if (allow_pass) { + nodelist.emplace_back(raw_netlist.policy_pass, FastBoard::PASS); + legal_sum += raw_netlist.policy_pass; + } if (legal_sum > std::numeric_limits::min()) { // re-normalize after removing illegal moves. @@ -118,12 +148,17 @@ bool UCTNode::create_children(std::atomic& nodecount, } link_nodelist(nodecount, nodelist, min_psa_ratio); + if (first_visit()) { + // Increment visit and assign eval. + update(eval); + } + expand_done(); return true; } void UCTNode::link_nodelist(std::atomic& nodecount, - std::vector& nodelist, - float min_psa_ratio) { + std::vector& nodelist, + const float min_psa_ratio) { assert(min_psa_ratio < m_min_psa_ratio_children); if (nodelist.empty()) { @@ -133,17 +168,13 @@ void UCTNode::link_nodelist(std::atomic& nodecount, // Use best to worst order, so highest go first std::stable_sort(rbegin(nodelist), rend(nodelist)); - LOCK(get_mutex(), lock); - const auto max_psa = nodelist[0].first; const auto old_min_psa = max_psa * m_min_psa_ratio_children; const auto new_min_psa = max_psa * min_psa_ratio; if (new_min_psa > 0.0f) { - m_children.reserve( - std::count_if(cbegin(nodelist), cend(nodelist), - [=](const auto& node) { return node.first >= new_min_psa; } - ) - ); + m_children.reserve(std::count_if( + cbegin(nodelist), cend(nodelist), + [=](const auto& node) { return node.first >= new_min_psa; })); } else { m_children.reserve(nodelist.size()); } @@ -159,14 +190,12 @@ void UCTNode::link_nodelist(std::atomic& nodecount, } m_min_psa_ratio_children = skipped_children ? 
min_psa_ratio : 0.0f; - m_is_expanding = false; } const std::vector& UCTNode::get_children() const { return m_children; } - int UCTNode::get_move() const { return m_move; } @@ -179,9 +208,17 @@ void UCTNode::virtual_loss_undo() { m_virtual_loss -= VIRTUAL_LOSS_COUNT; } -void UCTNode::update(float eval) { +void UCTNode::update(const float eval) { + // Cache values to avoid race conditions. + auto old_eval = static_cast(m_blackevals); + auto old_visits = static_cast(m_visits); + auto old_delta = old_visits > 0 ? eval - old_eval / old_visits : 0.0f; m_visits++; accumulate_eval(eval); + auto new_delta = eval - (old_eval + eval) / (old_visits + 1); + // Welford's online algorithm for calculating variance. + auto delta = old_delta * new_delta; + atomic_add(m_squared_eval_diff, delta); } bool UCTNode::has_children() const { @@ -189,40 +226,69 @@ bool UCTNode::has_children() const { } bool UCTNode::expandable(const float min_psa_ratio) const { +#ifndef NDEBUG + if (m_min_psa_ratio_children == 0.0f) { + // If we figured out that we are fully expandable + // it is impossible that we stay in INITIAL state. + assert(m_expand_state.load() != ExpandState::INITIAL); + } +#endif return min_psa_ratio < m_min_psa_ratio_children; } -float UCTNode::get_score() const { - return m_score; +float UCTNode::get_policy() const { + return m_policy; } -void UCTNode::set_score(float score) { - m_score = score; +void UCTNode::set_policy(const float policy) { + m_policy = policy; +} + +float UCTNode::get_eval_variance(const float default_var) const { + return m_visits > 1 ? m_squared_eval_diff / (m_visits - 1) : default_var; } int UCTNode::get_visits() const { return m_visits; } -float UCTNode::get_eval(int tomove) const { - // Due to the use of atomic updates and virtual losses, it is - // possible for the visit count to change underneath us. Make sure - // to return a consistent result to the caller by caching the values. 
- auto virtual_loss = int{m_virtual_loss}; +float UCTNode::get_eval_lcb(const int color) const { + // Lower confidence bound of winrate. + auto visits = get_visits(); + if (visits < 2) { + // Return large negative value if not enough visits. + return -1e6f + visits; + } + auto mean = get_raw_eval(color); + + auto stddev = std::sqrt(get_eval_variance(1.0f) / visits); + auto z = cached_t_quantile(visits - 1); + + return mean - z * stddev; +} + +float UCTNode::get_raw_eval(const int tomove, const int virtual_loss) const { auto visits = get_visits() + virtual_loss; assert(visits > 0); auto blackeval = get_blackevals(); if (tomove == FastBoard::WHITE) { blackeval += static_cast(virtual_loss); } - auto score = static_cast(blackeval / double(visits)); + auto eval = static_cast(blackeval / double(visits)); if (tomove == FastBoard::WHITE) { - score = 1.0f - score; + eval = 1.0f - eval; } - return score; + return eval; +} + +float UCTNode::get_eval(const int tomove) const { + // Due to the use of atomic updates and virtual losses, it is + // possible for the visit count to change underneath us. Make sure + // to return a consistent result to the caller by caching the values. + return get_raw_eval(tomove, m_virtual_loss); } -float UCTNode::get_net_eval(int tomove) const { +float UCTNode::get_net_eval(const int tomove) const { if (tomove == FastBoard::WHITE) { return 1.0f - m_net_eval; } @@ -233,12 +299,12 @@ double UCTNode::get_blackevals() const { return m_blackevals; } -void UCTNode::accumulate_eval(float eval) { +void UCTNode::accumulate_eval(const float eval) { atomic_add(m_blackevals, double(eval)); } -UCTNode* UCTNode::uct_select_child(int color, bool is_root) { - LOCK(get_mutex(), lock); +UCTNode* UCTNode::uct_select_child(const int color, const bool is_root) { + wait_expanded(); // Count parentvisits manually to avoid issues with transpositions. 
auto total_visited_policy = 0.0f; @@ -247,21 +313,19 @@ UCTNode* UCTNode::uct_select_child(int color, bool is_root) { if (child.valid()) { parentvisits += child.get_visits(); if (child.get_visits() > 0) { - total_visited_policy += child.get_score(); + total_visited_policy += child.get_policy(); } } } - auto numerator = std::sqrt(double(parentvisits)); - auto fpu_reduction = 0.0f; - // Lower the expected eval for moves that are likely not the best. - // Do not do this if we have introduced noise at this node exactly - // to explore more. - if (!is_root || !cfg_noise) { - fpu_reduction = cfg_fpu_reduction * std::sqrt(total_visited_policy); - } - // Estimated eval for unknown nodes = original parent NN eval - reduction - auto fpu_eval = get_net_eval(color) - fpu_reduction; + const auto numerator = std::sqrt( + double(parentvisits) + * std::log(cfg_logpuct * double(parentvisits) + cfg_logconst)); + const auto fpu_reduction = + (is_root ? cfg_fpu_root_reduction : cfg_fpu_reduction) + * std::sqrt(total_visited_policy); + // Estimated eval for unknown nodes = parent (not NN) eval - reduction + const auto fpu_eval = get_raw_eval(color) - fpu_reduction; auto best = static_cast(nullptr); auto best_value = std::numeric_limits::lowest(); @@ -272,13 +336,18 @@ UCTNode* UCTNode::uct_select_child(int color, bool is_root) { } auto winrate = fpu_eval; - if (child.get_visits() > 0) { + if (child.is_inflated() + && child->m_expand_state.load() == ExpandState::EXPANDING) { + // Someone else is expanding this node, never select it + // if we can avoid so, because we'd block on it. 
+ winrate = -1.0f - fpu_reduction; + } else if (child.get_visits() > 0) { winrate = child.get_eval(color); } - auto psa = child.get_score(); - auto denom = 1.0 + child.get_visits(); - auto puct = cfg_puct * psa * (numerator / denom); - auto value = winrate + puct; + const auto psa = child.get_policy(); + const auto denom = 1.0 + child.get_visits(); + const auto puct = cfg_puct * psa * (numerator / denom); + const auto value = winrate + puct; assert(value > std::numeric_limits::lowest()); if (value > best_value) { @@ -292,50 +361,85 @@ UCTNode* UCTNode::uct_select_child(int color, bool is_root) { return best->get(); } -class NodeComp : public std::binary_function { +class NodeComp + : public std::binary_function { public: - NodeComp(int color) : m_color(color) {}; - bool operator()(const UCTNodePointer& a, - const UCTNodePointer& b) { + NodeComp(const int color, const float lcb_min_visits) + : m_color(color), m_lcb_min_visits(lcb_min_visits) {} + + // WARNING : on very unusual cases this can be called on multithread + // contexts (e.g., UCTSearch::get_pv()) so beware of race conditions + bool operator()(const UCTNodePointer& a, const UCTNodePointer& b) { + auto a_visit = a.get_visits(); + auto b_visit = b.get_visits(); + + // Need at least 2 visits for LCB. + if (m_lcb_min_visits < 2) { + m_lcb_min_visits = 2; + } + + // Calculate the lower confidence bound for each node. 
+ if ((a_visit > m_lcb_min_visits) && (b_visit > m_lcb_min_visits)) { + auto a_lcb = a.get_eval_lcb(m_color); + auto b_lcb = b.get_eval_lcb(m_color); + + // Sort on lower confidence bounds + if (a_lcb != b_lcb) { + return a_lcb < b_lcb; + } + } + // if visits are not same, sort on visits - if (a.get_visits() != b.get_visits()) { - return a.get_visits() < b.get_visits(); + if (a_visit != b_visit) { + return a_visit < b_visit; } - // neither has visits, sort on prior score - if (a.get_visits() == 0) { - return a.get_score() < b.get_score(); + // neither has visits, sort on policy prior + if (a_visit == 0) { + return a.get_policy() < b.get_policy(); } // both have same non-zero number of visits return a.get_eval(m_color) < b.get_eval(m_color); } + private: int m_color; + float m_lcb_min_visits; }; -void UCTNode::sort_children(int color) { - LOCK(get_mutex(), lock); - std::stable_sort(rbegin(m_children), rend(m_children), NodeComp(color)); +void UCTNode::sort_children(const int color, const float lcb_min_visits) { + std::stable_sort(rbegin(m_children), rend(m_children), + NodeComp(color, lcb_min_visits)); } -UCTNode& UCTNode::get_best_root_child(int color) { - LOCK(get_mutex(), lock); +UCTNode& UCTNode::get_best_root_child(const int color) const { + wait_expanded(); + assert(!m_children.empty()); - auto ret = std::max_element(begin(m_children), end(m_children), - NodeComp(color)); + auto max_visits = 0; + for (const auto& node : m_children) { + max_visits = std::max(max_visits, node.get_visits()); + } + + auto ret = + std::max_element(begin(m_children), end(m_children), + NodeComp(color, cfg_lcb_min_visit_ratio * max_visits)); ret->inflate(); + return *(ret->get()); } -size_t UCTNode::count_nodes() const { +size_t UCTNode::count_nodes_and_clear_expand_state() { auto nodecount = size_t{0}; nodecount += m_children.size(); + if (expandable()) { + m_expand_state = ExpandState::INITIAL; + } for (auto& child : m_children) { - if (child.get_visits() > 0) { - nodecount += 
child->count_nodes(); + if (child.is_inflated()) { + nodecount += child->count_nodes_and_clear_expand_state(); } } return nodecount; @@ -358,3 +462,32 @@ bool UCTNode::valid() const { bool UCTNode::active() const { return m_status == ACTIVE; } + +bool UCTNode::acquire_expanding() { + auto expected = ExpandState::INITIAL; + auto newval = ExpandState::EXPANDING; + return m_expand_state.compare_exchange_strong(expected, newval); +} + +void UCTNode::expand_done() { + auto v = m_expand_state.exchange(ExpandState::EXPANDED); +#ifdef NDEBUG + (void)v; +#endif + assert(v == ExpandState::EXPANDING); +} +void UCTNode::expand_cancel() { + auto v = m_expand_state.exchange(ExpandState::INITIAL); +#ifdef NDEBUG + (void)v; +#endif + assert(v == ExpandState::EXPANDING); +} +void UCTNode::wait_expanded() const { + while (m_expand_state.load() == ExpandState::EXPANDING) {} + auto v = m_expand_state.load(); +#ifdef NDEBUG + (void)v; +#endif + assert(v == ExpandState::EXPANDED); +} diff --git a/src/UCTNode.h b/src/UCTNode.h index 9c88961e7..bfad1c7a4 100644 --- a/src/UCTNode.h +++ b/src/UCTNode.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef UCTNODE_H_INCLUDED @@ -22,10 +33,10 @@ #include "config.h" #include -#include -#include #include #include +#include +#include #include "GameState.h" #include "Network.h" @@ -39,49 +50,52 @@ class UCTNode { // search tree. static constexpr auto VIRTUAL_LOSS_COUNT = 3; // Defined in UCTNode.cpp - explicit UCTNode(int vertex, float score); + explicit UCTNode(int vertex, float policy); UCTNode() = delete; ~UCTNode() = default; - bool create_children(std::atomic& nodecount, - GameState& state, float& eval, + bool create_children(Network& network, std::atomic& nodecount, + const GameState& state, float& eval, float min_psa_ratio = 0.0f); const std::vector& get_children() const; - void sort_children(int color); - UCTNode& get_best_root_child(int color); + void sort_children(int color, float lcb_min_visits); + UCTNode& get_best_root_child(int color) const; UCTNode* uct_select_child(int color, bool is_root); - size_t count_nodes() const; - SMP::Mutex& get_mutex(); + size_t count_nodes_and_clear_expand_state(); bool first_visit() const; bool has_children() const; - bool expandable(const float min_psa_ratio = 0.0f) const; + bool expandable(float min_psa_ratio = 0.0f) const; void invalidate(); - void set_active(const bool active); + void set_active(bool active); bool valid() const; bool active() const; int get_move() const; int get_visits() const; - float get_score() const; - void set_score(float score); + float get_policy() const; + void 
set_policy(float policy); + float get_eval_variance(float default_var = 0.0f) const; float get_eval(int tomove) const; + float get_raw_eval(int tomove, int virtual_loss = 0) const; float get_net_eval(int tomove) const; - void virtual_loss(void); - void virtual_loss_undo(void); + void virtual_loss(); + void virtual_loss_undo(); void update(float eval); + float get_eval_lcb(int color) const; // Defined in UCTNodeRoot.cpp, only to be called on m_root in UCTSearch void randomize_first_proportionally(); - void prepare_root_node(int color, - std::atomic& nodecount, - GameState& state); + void prepare_root_node(Network& network, int color, + std::atomic& nodecount, GameState& state); UCTNode* get_first_child() const; UCTNode* get_nopass_child(FastState& state) const; - std::unique_ptr find_child(const int move); + std::unique_ptr find_child(int move); void inflate_all_children(); + void clear_expand_state(); + private: enum Status : char { INVALID, // superko @@ -89,11 +103,11 @@ class UCTNode { ACTIVE }; void link_nodelist(std::atomic& nodecount, - std::vector& nodelist, + std::vector& nodelist, float min_psa_ratio); double get_blackevals() const; void accumulate_eval(float eval); - void kill_superkos(const KoState& state); + void kill_superkos(const GameState& state); void dirichlet_noise(float epsilon, float alpha); // Note : This class is very size-sensitive as we are going to create @@ -106,18 +120,50 @@ class UCTNode { std::atomic m_virtual_loss{0}; std::atomic m_visits{0}; // UCT eval - float m_score; + float m_policy; // Original net eval for this node (not children). float m_net_eval{0.0f}; + // Variable used for calculating variance of evaluations. + // Initialized to small non-zero value to avoid accidental zero variances + // at low visits. + std::atomic m_squared_eval_diff{1e-4f}; std::atomic m_blackevals{0.0}; std::atomic m_status{ACTIVE}; - // Is someone adding scores to this node? 
- bool m_is_expanding{false}; - SMP::Mutex m_nodemutex; + + // m_expand_state acts as the lock for m_children. + // see manipulation methods below for possible state transition + enum class ExpandState : std::uint8_t { + // initial state, no children + INITIAL = 0, + + // creating children. the thread that changed the node's state to + // EXPANDING is responsible of finishing the expansion and then + // move to EXPANDED, or revert to INITIAL if impossible + EXPANDING, + + // expansion done. m_children cannot be modified on a multi-thread + // context, until node is destroyed. + EXPANDED, + }; + std::atomic m_expand_state{ExpandState::INITIAL}; // Tree data std::atomic m_min_psa_ratio_children{2.0f}; std::vector m_children; + + // m_expand_state manipulation methods + // INITIAL -> EXPANDING + // Return false if current state is not INITIAL + bool acquire_expanding(); + + // EXPANDING -> DONE + void expand_done(); + + // EXPANDING -> INITIAL + void expand_cancel(); + + // wait until we are on EXPANDED state + void wait_expanded() const; }; #endif diff --git a/src/UCTNodePointer.cpp b/src/UCTNodePointer.cpp index bb51f83b0..2beabebe5 100644 --- a/src/UCTNodePointer.cpp +++ b/src/UCTNodePointer.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2018-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,83 +14,151 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" #include -#include #include #include +#include #include "UCTNode.h" +std::atomic UCTNodePointer::m_tree_size = {0}; + +size_t UCTNodePointer::get_tree_size() { + return m_tree_size.load(); +} + +void UCTNodePointer::increment_tree_size(const size_t sz) { + m_tree_size += sz; +} + +void UCTNodePointer::decrement_tree_size(const size_t sz) { + assert(UCTNodePointer::m_tree_size >= sz); + m_tree_size -= sz; +} + UCTNodePointer::~UCTNodePointer() { - if (is_inflated()) { - delete read_ptr(); + auto sz = sizeof(UCTNodePointer); + auto v = m_data.load(); + if (is_inflated(v)) { + delete read_ptr(v); + sz += sizeof(UCTNode); } + decrement_tree_size(sz); } UCTNodePointer::UCTNodePointer(UCTNodePointer&& n) { - if (is_inflated()) { - delete read_ptr(); - } - m_data = n.m_data; - n.m_data = 1; // non-inflated garbage + auto nv = std::atomic_exchange(&n.m_data, INVALID); + auto v = std::atomic_exchange(&m_data, nv); +#ifdef NDEBUG + (void)v; +#else + assert(v == INVALID); +#endif + increment_tree_size(sizeof(UCTNodePointer)); } -UCTNodePointer::UCTNodePointer(std::int16_t vertex, float score) { - std::uint32_t i_score; +UCTNodePointer::UCTNodePointer(const std::int16_t vertex, const float policy) { + std::uint32_t i_policy; auto i_vertex = static_cast(vertex); - std::memcpy(&i_score, &score, sizeof(i_score)); + std::memcpy(&i_policy, &policy, sizeof(i_policy)); - m_data = (static_cast(i_score) << 32) - | (static_cast(i_vertex) << 16) | 
1ULL; + m_data = (static_cast(i_policy) << 32) + | (static_cast(i_vertex) << 16); + increment_tree_size(sizeof(UCTNodePointer)); } UCTNodePointer& UCTNodePointer::operator=(UCTNodePointer&& n) { - if (is_inflated()) { - delete read_ptr(); - } - m_data = n.m_data; - n.m_data = 1; + auto nv = std::atomic_exchange(&n.m_data, INVALID); + auto v = std::atomic_exchange(&m_data, nv); + if (is_inflated(v)) { + decrement_tree_size(sizeof(UCTNode)); + delete read_ptr(v); + } return *this; } +UCTNode* UCTNodePointer::release() { + auto v = std::atomic_exchange(&m_data, INVALID); + decrement_tree_size(sizeof(UCTNode)); + return read_ptr(v); +} + void UCTNodePointer::inflate() const { - if (is_inflated()) return; - m_data = reinterpret_cast( - new UCTNode(read_vertex(), read_score())); + while (true) { + auto v = m_data.load(); + if (is_inflated(v)) return; + + auto v2 = reinterpret_cast( + new UCTNode(read_vertex(v), read_policy(v))); + assert((v2 & 3ULL) == 0); + v2 |= POINTER; + bool success = m_data.compare_exchange_strong(v, v2); + if (success) { + increment_tree_size(sizeof(UCTNode)); + return; + } else { + // this means that somebody else also modified this instance. 
+ // Try again next time + delete read_ptr(v2); + } + } } bool UCTNodePointer::valid() const { - if (is_inflated()) return read_ptr()->valid(); + auto v = m_data.load(); + if (is_inflated(v)) return read_ptr(v)->valid(); return true; } int UCTNodePointer::get_visits() const { - if (is_inflated()) return read_ptr()->get_visits(); + auto v = m_data.load(); + if (is_inflated(v)) return read_ptr(v)->get_visits(); return 0; } -float UCTNodePointer::get_score() const { - if (is_inflated()) return read_ptr()->get_score(); - return read_score(); +float UCTNodePointer::get_policy() const { + auto v = m_data.load(); + if (is_inflated(v)) return read_ptr(v)->get_policy(); + return read_policy(v); +} + +float UCTNodePointer::get_eval_lcb(const int color) const { + auto v = m_data.load(); + assert(is_inflated(v)); + return read_ptr(v)->get_eval_lcb(color); } bool UCTNodePointer::active() const { - if (is_inflated()) return read_ptr()->active(); + auto v = m_data.load(); + if (is_inflated(v)) return read_ptr(v)->active(); return true; } -float UCTNodePointer::get_eval(int tomove) const { +float UCTNodePointer::get_eval(const int tomove) const { // this can only be called if it is an inflated pointer - assert(is_inflated()); - return read_ptr()->get_eval(tomove); + auto v = m_data.load(); + assert(is_inflated(v)); + return read_ptr(v)->get_eval(tomove); } int UCTNodePointer::get_move() const { - if (is_inflated()) return read_ptr()->get_move(); - return read_vertex(); + auto v = m_data.load(); + if (is_inflated(v)) return read_ptr(v)->get_move(); + return read_vertex(v); } diff --git a/src/UCTNodePointer.h b/src/UCTNodePointer.h index d40bbc2ff..a3dae7a72 100644 --- a/src/UCTNodePointer.h +++ b/src/UCTNodePointer.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2018 Gian-Carlo Pascutto + Copyright (C) 2018-2019 Gian-Carlo Pascutto Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef UCTNODEPOINTER_H_INCLUDED @@ -22,9 +33,9 @@ #include "config.h" #include -#include #include #include +#include #include "SMP.h" @@ -39,77 +50,89 @@ class UCTNode; // - std::unique_ptr pointer; // - std::pair args; -// WARNING : inflate() is not thread-safe and hence has to be protected -// by an external lock. +// All methods should be thread-safe except destructor and when +// the instanced is 'moved from'. class UCTNodePointer { private: + static constexpr std::uint64_t INVALID = 2; + static constexpr std::uint64_t POINTER = 1; + static constexpr std::uint64_t UNINFLATED = 0; + + static std::atomic m_tree_size; + static void increment_tree_size(size_t sz); + static void decrement_tree_size(size_t sz); + // the raw storage used here. - // if bit 0 is 0, m_data is the actual pointer. - // if bit 0 is 1, bit [31:16] is the vertex value, bit [63:32] is the score. + // if bit [1:0] is 1, m_data is the actual pointer. 
+ // if bit [1:0] is 0, bit [31:16] is the vertex value, bit [63:32] is the policy + // if bit [1:0] is other values, it should assert-fail // (C-style bit fields and unions are not portable) - mutable uint64_t m_data = 1; + mutable std::atomic m_data{INVALID}; - UCTNode * read_ptr() const { - assert(is_inflated()); - return reinterpret_cast(m_data); + UCTNode* read_ptr(const uint64_t v) const { + assert((v & 3ULL) == POINTER); + return reinterpret_cast(v & ~(0x3ULL)); } - std::int16_t read_vertex() const { - assert(!is_inflated()); - return static_cast(m_data >> 16); + std::int16_t read_vertex(const uint64_t v) const { + assert((v & 3ULL) == UNINFLATED); + return static_cast(v >> 16); } - float read_score() const { + float read_policy(const uint64_t v) const { static_assert(sizeof(float) == 4, - "This code assumes floats are 32-bit"); - assert(!is_inflated()); + "This code assumes floats are 32-bit"); + assert((v & 3ULL) == UNINFLATED); - auto x = static_cast(m_data >> 32); + auto x = static_cast(v >> 32); float ret; std::memcpy(&ret, &x, sizeof(ret)); return ret; } + bool is_inflated(const uint64_t v) const { + return (v & 3ULL) == POINTER; + } + public: + static size_t get_tree_size(); + ~UCTNodePointer(); UCTNodePointer(UCTNodePointer&& n); - UCTNodePointer(std::int16_t vertex, float score); + UCTNodePointer(std::int16_t vertex, float policy); UCTNodePointer(const UCTNodePointer&) = delete; bool is_inflated() const { - return (m_data & 1ULL) == 0; + return is_inflated(m_data.load()); } // methods from std::unique_ptr - typename std::add_lvalue_reference::type operator*() const{ - return *read_ptr(); + typename std::add_lvalue_reference::type operator*() const { + return *read_ptr(m_data.load()); } UCTNode* operator->() const { - return read_ptr(); + return read_ptr(m_data.load()); } UCTNode* get() const { - return read_ptr(); + return read_ptr(m_data.load()); } UCTNodePointer& operator=(UCTNodePointer&& n); - UCTNode * release() { - auto ret = read_ptr(); - 
m_data = 1; - return ret; - } + UCTNode* release(); - // construct UCTNode instance from the vertex/score pair + // construct UCTNode instance from the vertex/policy pair void inflate() const; // proxy of UCTNode methods which can be called without // constructing UCTNode bool valid() const; int get_visits() const; - float get_score() const; + float get_policy() const; bool active() const; int get_move() const; - // this can only be called if it is an inflated pointer + // these can only be called if it is an inflated pointer float get_eval(int tomove) const; + float get_eval_lcb(int color) const; }; #endif diff --git a/src/UCTNodeRoot.cpp b/src/UCTNodeRoot.cpp index 032a2f762..a214e2cf9 100644 --- a/src/UCTNodeRoot.cpp +++ b/src/UCTNodeRoot.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2018 Gian-Carlo Pascutto + Copyright (C) 2018-2019 Gian-Carlo Pascutto Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" @@ -26,14 +37,13 @@ #include #include -#include "UCTNode.h" #include "FastBoard.h" #include "FastState.h" +#include "GTP.h" #include "KoState.h" #include "Random.h" #include "UCTNode.h" #include "Utils.h" -#include "GTP.h" /* * These functions belong to UCTNode but should only be called on the root node @@ -48,7 +58,10 @@ UCTNode* UCTNode::get_first_child() const { return m_children.front().get(); } -void UCTNode::kill_superkos(const KoState& state) { +void UCTNode::kill_superkos(const GameState& state) { + UCTNodePointer* pass_child = nullptr; + size_t valid_count = 0; + for (auto& child : m_children) { auto move = child->get_move(); if (move != FastBoard::PASS) { @@ -59,18 +72,29 @@ void UCTNode::kill_superkos(const KoState& state) { // Don't delete nodes for now, just mark them invalid. child->invalidate(); } + } else { + pass_child = &child; + } + if (child->valid()) { + valid_count++; } } + if (valid_count > 1 && pass_child + && !state.is_move_legal(state.get_to_move(), FastBoard::PASS)) { + // Remove the PASS node according to "avoid" -- but only if there are + // other valid nodes left. + (*pass_child)->invalidate(); + } + // Now do the actual deletion. 
m_children.erase( std::remove_if(begin(m_children), end(m_children), - [](const auto &child) { return !child->valid(); }), - end(m_children) - ); + [](const auto& child) { return !child->valid(); }), + end(m_children)); } -void UCTNode::dirichlet_noise(float epsilon, float alpha) { +void UCTNode::dirichlet_noise(const float epsilon, const float alpha) { auto child_cnt = m_children.size(); auto dirichlet_vector = std::vector{}; @@ -79,8 +103,8 @@ void UCTNode::dirichlet_noise(float epsilon, float alpha) { dirichlet_vector.emplace_back(gamma(Random::get_Rng())); } - auto sample_sum = std::accumulate(begin(dirichlet_vector), - end(dirichlet_vector), 0.0f); + auto sample_sum = + std::accumulate(begin(dirichlet_vector), end(dirichlet_vector), 0.0f); // If the noise vector sums to 0 or a denormal, then don't try to // normalize. @@ -94,10 +118,10 @@ void UCTNode::dirichlet_noise(float epsilon, float alpha) { child_cnt = 0; for (auto& child : m_children) { - auto score = child->get_score(); + auto policy = child->get_policy(); auto eta_a = dirichlet_vector[child_cnt++]; - score = score * (1 - epsilon) + epsilon * eta_a; - child->set_score(score); + policy = policy * (1 - epsilon) + epsilon * eta_a; + child->set_policy(policy); } } @@ -116,8 +140,7 @@ void UCTNode::randomize_first_proportionally() { } } if (visits > cfg_random_min_visits) { - accum += std::pow(visits / norm_factor, - 1.0 / cfg_random_temp); + accum += std::pow(visits / norm_factor, 1.0 / cfg_random_temp); accum_vector.emplace_back(accum); } } @@ -161,7 +184,7 @@ UCTNode* UCTNode::get_nopass_child(FastState& state) const { std::unique_ptr UCTNode::find_child(const int move) { for (auto& child : m_children) { if (child.get_move() == move) { - // no guarantee that this is a non-inflated node + // no guarantee that this is a non-inflated node child.inflate(); return std::unique_ptr(child.release()); } @@ -177,18 +200,17 @@ void UCTNode::inflate_all_children() { } } -void UCTNode::prepare_root_node(int color, 
+void UCTNode::prepare_root_node(Network& network, const int color, std::atomic& nodes, GameState& root_state) { float root_eval; const auto had_children = has_children(); if (expandable()) { - create_children(nodes, root_state, root_eval); + create_children(network, nodes, root_state, root_eval); } if (had_children) { - root_eval = get_eval(color); + root_eval = get_net_eval(color); } else { - update(root_eval); root_eval = (color == FastBoard::BLACK ? root_eval : 1.0f - root_eval); } Utils::myprintf("NN eval=%f\n", root_eval); @@ -203,7 +225,7 @@ void UCTNode::prepare_root_node(int color, if (cfg_noise) { // Adjust the Dirichlet noise's alpha constant to the board size - auto alpha = 0.03f * 361.0f / BOARD_SQUARES; + auto alpha = 0.03f * 361.0f / NUM_INTERSECTIONS; dirichlet_noise(0.25f, alpha); } } diff --git a/src/UCTSearch.cpp b/src/UCTSearch.cpp index accb3352f..d31c200da 100644 --- a/src/UCTSearch.cpp +++ b/src/UCTSearch.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,18 +14,34 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include "config.h" -#include "UCTSearch.h" +#include +#include +#include #include #include #include +#include #include #include #include +#include "UCTSearch.h" + #include "FastBoard.h" #include "FastState.h" #include "FullBoard.h" @@ -35,15 +51,72 @@ #include "Timing.h" #include "Training.h" #include "Utils.h" +#ifdef USE_OPENCL +#include "OpenCLScheduler.h" +#endif using namespace Utils; constexpr int UCTSearch::UNLIMITED_PLAYOUTS; -UCTSearch::UCTSearch(GameState& g) - : m_rootstate(g) { +class OutputAnalysisData { +public: + OutputAnalysisData(std::string move, const int visits, const float winrate, + const float policy_prior, std::string pv, + const float lcb, const bool lcb_ratio_exceeded) + : m_move(std::move(move)), + m_visits(visits), + m_winrate(winrate), + m_policy_prior(policy_prior), + m_pv(std::move(pv)), + m_lcb(lcb), + m_lcb_ratio_exceeded(lcb_ratio_exceeded) {} + + std::string get_info_string(const int order) const { + auto tmp = "info move " + m_move + + " visits " + std::to_string(m_visits) + + " winrate " + + std::to_string(static_cast(m_winrate * 10000)) + + " prior " + + std::to_string(static_cast(m_policy_prior * 10000.0f)) + + " lcb " + + std::to_string(static_cast(std::max(0.0f, m_lcb) + * 10000)); + if (order >= 0) { + tmp += " order " + std::to_string(order); + } + tmp += " pv " + m_pv; + return tmp; + } + + friend bool operator<(const OutputAnalysisData& a, + const OutputAnalysisData& b) { + if (a.m_lcb_ratio_exceeded && b.m_lcb_ratio_exceeded) { + if (a.m_lcb != b.m_lcb) { + return a.m_lcb < b.m_lcb; + } + } + if (a.m_visits == b.m_visits) { + return a.m_winrate < b.m_winrate; + } + return a.m_visits < b.m_visits; + } + +private: + std::string m_move; + int m_visits; + float m_winrate; + float m_policy_prior; + std::string m_pv; + float m_lcb; + bool m_lcb_ratio_exceeded; +}; + +UCTSearch::UCTSearch(GameState& g, Network& network) + : m_rootstate(g), m_network(network) { set_playout_limit(cfg_max_playouts); 
set_visit_limit(cfg_max_visits); + m_root = std::make_unique(FastBoard::PASS, 0.0f); } @@ -64,7 +137,6 @@ bool UCTSearch::advance_to_new_rootstate() { return false; } - auto test = std::make_unique(m_rootstate); for (auto i = 0; i < depth; i++) { test->undo_move(); @@ -123,7 +195,7 @@ void UCTSearch::update_root() { m_playouts = 0; #ifndef NDEBUG - auto start_nodes = m_root->count_nodes(); + auto start_nodes = m_root->count_nodes_and_clear_expand_state(); #endif if (!advance_to_new_rootstate() || !m_root) { @@ -133,23 +205,29 @@ void UCTSearch::update_root() { m_last_rootstate.reset(nullptr); // Check how big our search tree (reused or new) is. - m_nodes = m_root->count_nodes(); + m_nodes = m_root->count_nodes_and_clear_expand_state(); #ifndef NDEBUG if (m_nodes > 0) { myprintf("update_root, %d -> %d nodes (%.1f%% reused)\n", - start_nodes, m_nodes.load(), 100.0 * m_nodes.load() / start_nodes); + start_nodes, m_nodes.load(), + 100.0 * m_nodes.load() / start_nodes); } #endif } float UCTSearch::get_min_psa_ratio() const { - const auto mem_full = m_nodes / static_cast(MAX_TREE_SIZE); + const auto mem_full = + UCTNodePointer::get_tree_size() / static_cast(cfg_max_tree_size); // If we are halfway through our memory budget, start trimming // moves with very low policy priors. if (mem_full > 0.5f) { // Memory is almost exhausted, trim more aggressively. if (mem_full > 0.95f) { + // if completely full just stop expansion by returning an impossible number + if (mem_full >= 1.0f) { + return 2.0f; + } return 0.01f; } return 0.001f; @@ -157,25 +235,34 @@ float UCTSearch::get_min_psa_ratio() const { return 0.0f; } -SearchResult UCTSearch::play_simulation(GameState & currstate, +SearchResult UCTSearch::play_simulation(GameState& currstate, UCTNode* const node) { const auto color = currstate.get_to_move(); auto result = SearchResult{}; + auto new_node = false; node->virtual_loss(); + // This will undo virtual loss even if something throws an exception. 
+ BOOST_SCOPE_EXIT(node) { + node->virtual_loss_undo(); + } BOOST_SCOPE_EXIT_END + if (node->expandable()) { if (currstate.get_passes() >= 2) { auto score = currstate.final_score(); result = SearchResult::from_score(score); - } else if (m_nodes < MAX_TREE_SIZE) { + } else { float eval; const auto had_children = node->has_children(); - const auto success = - node->create_children(m_nodes, currstate, eval, - get_min_psa_ratio()); + + // Careful: create_children() can throw a NetworkHaltException when + // another thread requests draining the search. + const auto success = node->create_children( + m_network, m_nodes, currstate, eval, get_min_psa_ratio()); if (!had_children && success) { result = SearchResult::from_eval(eval); + new_node = true; } } } @@ -192,23 +279,28 @@ SearchResult UCTSearch::play_simulation(GameState & currstate, } } - if (result.valid()) { + // New node was updated in create_children. + if (result.valid() && !new_node) { node->update(result.eval()); } - node->virtual_loss_undo(); return result; } -void UCTSearch::dump_stats(FastState & state, UCTNode & parent) { +void UCTSearch::dump_stats(const FastState& state, UCTNode& parent) { if (cfg_quiet || !parent.has_children()) { return; } const int color = state.get_to_move(); + auto max_visits = 0; + for (const auto& node : parent.get_children()) { + max_visits = std::max(max_visits, node->get_visits()); + } + // sort children, put best move on top - parent.sort_children(color); + parent.sort_children(color, cfg_lcb_min_visit_ratio * max_visits); if (parent.get_first_child()->first_visit()) { return; @@ -220,42 +312,71 @@ void UCTSearch::dump_stats(FastState & state, UCTNode & parent) { // only one move searched the user could get an idea why. 
if (++movecount > 2 && !node->get_visits()) break; - std::string move = state.move_to_text(node->get_move()); - FastState tmpstate = state; + auto move = state.move_to_text(node->get_move()); + auto tmpstate = FastState{state}; tmpstate.play_move(node->get_move()); - std::string pv = move + " " + get_pv(tmpstate, *node); + auto pv = move + " " + get_pv(tmpstate, *node); - myprintf("%4s -> %7d (V: %5.2f%%) (N: %5.2f%%) PV: %s\n", - move.c_str(), - node->get_visits(), - node->get_visits() ? node->get_eval(color)*100.0f : 0.0f, - node->get_score() * 100.0f, - pv.c_str()); + myprintf("%4s -> %7d (V: %5.2f%%) (LCB: %5.2f%%) (N: %5.2f%%) PV: %s\n", + move.c_str(), node->get_visits(), + node->get_visits() ? node->get_raw_eval(color) * 100.0f : 0.0f, + std::max(0.0f, node->get_eval_lcb(color) * 100.0f), + node->get_policy() * 100.0f, pv.c_str()); } tree_stats(parent); } -void tree_stats_helper(const UCTNode& node, size_t depth, - size_t& nodes, size_t& non_leaf_nodes, - size_t& depth_sum, size_t& max_depth, - size_t& children_count) { - nodes += 1; - non_leaf_nodes += node.get_visits() > 1; - depth_sum += depth; - if (depth > max_depth) max_depth = depth; - - for (const auto& child : node.get_children()) { - if (child.get_visits() > 0) { - children_count += 1; - tree_stats_helper(*(child.get()), depth+1, - nodes, non_leaf_nodes, depth_sum, - max_depth, children_count); - } else { - nodes += 1; - depth_sum += depth+1; - if (depth+1 > max_depth) max_depth = depth+1; +void UCTSearch::output_analysis(const FastState& state, const UCTNode& parent) { + // We need to make a copy of the data before sorting + auto sortable_data = std::vector(); + + if (!parent.has_children()) { + return; + } + + const auto color = state.get_to_move(); + + auto max_visits = 0; + for (const auto& node : parent.get_children()) { + max_visits = std::max(max_visits, node->get_visits()); + } + + for (const auto& node : parent.get_children()) { + // Send only variations with visits, unless more moves were 
+ // requested explicitly. + if (!node->get_visits() + && sortable_data.size() >= cfg_analyze_tags.post_move_count()) { + continue; } + auto move = state.move_to_text(node->get_move()); + auto tmpstate = FastState{state}; + tmpstate.play_move(node->get_move()); + auto rest_of_pv = get_pv(tmpstate, *node); + auto pv = move + (rest_of_pv.empty() ? "" : " " + rest_of_pv); + auto move_eval = node->get_visits() ? node->get_raw_eval(color) : 0.0f; + auto policy = node->get_policy(); + auto lcb = node->get_eval_lcb(color); + auto visits = node->get_visits(); + // Need at least 2 visits for valid LCB. + auto lcb_ratio_exceeded = + visits > 2 && visits > max_visits * cfg_lcb_min_visit_ratio; + // Store data in array + sortable_data.emplace_back(move, visits, move_eval, policy, pv, lcb, + lcb_ratio_exceeded); + } + // Sort array to decide order + std::stable_sort(rbegin(sortable_data), rend(sortable_data)); + + auto i = 0; + // Output analysis data in gtp stream + for (const auto& node : sortable_data) { + if (i > 0) { + gtp_printf_raw(" "); + } + gtp_printf_raw(node.get_info_string(i).c_str()); + i++; } + gtp_printf_raw("\n"); } void UCTSearch::tree_stats(const UCTNode& node) { @@ -264,19 +385,37 @@ void UCTSearch::tree_stats(const UCTNode& node) { size_t depth_sum = 0; size_t max_depth = 0; size_t children_count = 0; - tree_stats_helper(node, 0, - nodes, non_leaf_nodes, depth_sum, - max_depth, children_count); + + std::function traverse = + [&](const UCTNode& node, size_t depth) { + nodes += 1; + non_leaf_nodes += node.get_visits() > 1; + depth_sum += depth; + if (depth > max_depth) max_depth = depth; + + for (const auto& child : node.get_children()) { + if (child.get_visits() > 0) { + children_count += 1; + traverse(*(child.get()), depth + 1); + } else { + nodes += 1; + depth_sum += depth + 1; + if (depth >= max_depth) max_depth = depth + 1; + } + } + }; + + traverse(node, 0); if (nodes > 0) { myprintf("%.1f average depth, %d max depth\n", - (1.0f*depth_sum) / nodes, 
max_depth); + (1.0f * depth_sum) / nodes, max_depth); myprintf("%d non leaf nodes, %.2f average children\n", - non_leaf_nodes, (1.0f*children_count) / non_leaf_nodes); + non_leaf_nodes, (1.0f * children_count) / non_leaf_nodes); } } -bool UCTSearch::should_resign(passflag_t passflag, float bestscore) { +bool UCTSearch::should_resign(const passflag_t passflag, const float besteval) { if (passflag & UCTSearch::NORESIGN) { // resign not allowed return false; @@ -287,9 +426,9 @@ bool UCTSearch::should_resign(passflag_t passflag, float bestscore) { return false; } - const size_t board_squares = m_rootstate.board.get_boardsize() - * m_rootstate.board.get_boardsize(); - const auto move_threshold = board_squares / 4; + const size_t num_intersections = + m_rootstate.board.get_boardsize() * m_rootstate.board.get_boardsize(); + const auto move_threshold = num_intersections / 4; const auto movenum = m_rootstate.get_movenum(); if (movenum <= move_threshold) { // too early in game to resign @@ -301,36 +440,45 @@ bool UCTSearch::should_resign(passflag_t passflag, float bestscore) { const auto is_default_cfg_resign = cfg_resignpct < 0; const auto resign_threshold = 0.01f * (is_default_cfg_resign ? 10 : cfg_resignpct); - if (bestscore > resign_threshold) { + if (besteval > resign_threshold) { // eval > cfg_resign return false; } - if ((m_rootstate.get_handicap() > 0) - && (color == FastBoard::WHITE) - && is_default_cfg_resign) { + if ((m_rootstate.get_handicap() > 0) && (color == FastBoard::WHITE) + && is_default_cfg_resign) { const auto handicap_resign_threshold = resign_threshold / (1 + m_rootstate.get_handicap()); // Blend the thresholds for the first ~215 moves. 
- auto blend_ratio = std::min(1.0f, movenum / (0.6f * board_squares)); - auto blended_resign_threshold = blend_ratio * resign_threshold + auto blend_ratio = std::min(1.0f, movenum / (0.6f * num_intersections)); + auto blended_resign_threshold = + blend_ratio * resign_threshold + (1 - blend_ratio) * handicap_resign_threshold; - if (bestscore > blended_resign_threshold) { + if (besteval > blended_resign_threshold) { // Allow lower eval for white in handicap games // where opp may fumble. return false; } } + if (!m_rootstate.is_move_legal(color, FastBoard::RESIGN)) { + return false; + } + return true; } -int UCTSearch::get_best_move(passflag_t passflag) { +int UCTSearch::get_best_move(const passflag_t passflag) { int color = m_rootstate.board.get_to_move(); + auto max_visits = 0; + for (const auto& node : m_root->get_children()) { + max_visits = std::max(max_visits, node->get_visits()); + } + // Make sure best is first - m_root->sort_children(color); + m_root->sort_children(color, cfg_lcb_min_visit_ratio * max_visits); // Check whether to randomize the best move proportional // to the playout counts, early game only. @@ -343,28 +491,31 @@ int UCTSearch::get_best_move(passflag_t passflag) { assert(first_child != nullptr); auto bestmove = first_child->get_move(); - auto bestscore = first_child->get_eval(color); + auto besteval = + first_child->first_visit() ? 0.5f : first_child->get_raw_eval(color); // do we want to fiddle with the best move because of the rule set? if (passflag & UCTSearch::NOPASS) { // were we going to pass? 
if (bestmove == FastBoard::PASS) { - UCTNode * nopass = m_root->get_nopass_child(m_rootstate); + UCTNode* nopass = m_root->get_nopass_child(m_rootstate); if (nopass != nullptr) { myprintf("Preferring not to pass.\n"); bestmove = nopass->get_move(); if (nopass->first_visit()) { - bestscore = 1.0f; + besteval = 1.0f; } else { - bestscore = nopass->get_eval(color); + besteval = nopass->get_raw_eval(color); } } else { myprintf("Pass is the only acceptable move.\n"); } } - } else { - if (!cfg_dumbpass && bestmove == FastBoard::PASS) { + } else if (!cfg_dumbpass) { + const auto relative_score = + (color == FastBoard::BLACK ? 1 : -1) * m_rootstate.final_score(); + if (bestmove == FastBoard::PASS) { // Either by forcing or coincidence passing is // on top...check whether passing loses instantly // do full count including dead stones. @@ -383,51 +534,67 @@ int UCTSearch::get_best_move(passflag_t passflag) { // heuristic so the engine can "clean up" the board. It will still // only clean up the bare necessity to win. For full dead stone // removal, kgs-genmove_cleanup and the NOPASS mode must be used. - float score = m_rootstate.final_score(); + // Do we lose by passing? - if ((score > 0.0f && color == FastBoard::WHITE) - || - (score < 0.0f && color == FastBoard::BLACK)) { + if (relative_score < 0.0f) { myprintf("Passing loses :-(\n"); // Find a valid non-pass move. - UCTNode * nopass = m_root->get_nopass_child(m_rootstate); + UCTNode* nopass = m_root->get_nopass_child(m_rootstate); if (nopass != nullptr) { myprintf("Avoiding pass because it loses.\n"); bestmove = nopass->get_move(); if (nopass->first_visit()) { - bestscore = 1.0f; + besteval = 1.0f; } else { - bestscore = nopass->get_eval(color); + besteval = nopass->get_raw_eval(color); } } else { myprintf("No alternative to passing.\n"); } - } else { + } else if (relative_score > 0.0f) { myprintf("Passing wins :-)\n"); + } else { + myprintf("Passing draws :-|\n"); + // Find a valid non-pass move. 
+ const auto nopass = m_root->get_nopass_child(m_rootstate); + if (nopass != nullptr && !nopass->first_visit()) { + const auto nopass_eval = nopass->get_raw_eval(color); + if (nopass_eval > 0.5f) { + myprintf("Avoiding pass because there could be a winning alternative.\n"); + bestmove = nopass->get_move(); + besteval = nopass_eval; + } + } + if (bestmove == FastBoard::PASS) { + myprintf("No seemingly better alternative to passing.\n"); + } } - } else if (!cfg_dumbpass - && m_rootstate.get_last_move() == FastBoard::PASS) { + } else if (m_rootstate.get_last_move() == FastBoard::PASS) { // Opponents last move was passing. // We didn't consider passing. Should we have and // end the game immediately? - float score = m_rootstate.final_score(); - // do we lose by passing? - if ((score > 0.0f && color == FastBoard::WHITE) - || - (score < 0.0f && color == FastBoard::BLACK)) { + + if (!m_rootstate.is_move_legal(color, FastBoard::PASS)) { + myprintf("Passing is forbidden, I'll play on.\n"); + } else if (relative_score < 0.0f) { myprintf("Passing loses, I'll play on.\n"); - } else { + } else if (relative_score > 0.0f) { myprintf("Passing wins, I'll pass out.\n"); bestmove = FastBoard::PASS; + } else { + myprintf("Passing draws, make it depend on evaluation.\n"); + if (besteval < 0.5f) { + bestmove = FastBoard::PASS; + } } } } // if we aren't passing, should we consider resigning? if (bestmove != FastBoard::PASS) { - if (should_resign(passflag, bestscore)) { + if (should_resign(passflag, besteval)) { myprintf("Eval (%.2f%%) looks bad. Resigning.\n", - 100.0f * bestscore); + 100.0f * besteval); bestmove = FastBoard::RESIGN; } } @@ -435,11 +602,19 @@ int UCTSearch::get_best_move(passflag_t passflag) { return bestmove; } -std::string UCTSearch::get_pv(FastState & state, UCTNode& parent) { +std::string UCTSearch::get_pv(FastState& state, const UCTNode& parent) { if (!parent.has_children()) { return std::string(); } + if (parent.expandable()) { + // Not fully expanded. 
This means someone could expand + // the node while we want to traverse the children. + // Avoid the race conditions and don't go through the rabbit hole + // of trying to print things from this node. + return std::string(); + } + auto& best_child = parent.get_best_root_child(state.get_to_move()); if (best_child.first_visit()) { return std::string(); @@ -456,25 +631,22 @@ std::string UCTSearch::get_pv(FastState & state, UCTNode& parent) { return res; } -void UCTSearch::dump_analysis(int playouts) { - if (cfg_quiet) { - return; - } - +std::string UCTSearch::get_analysis(const int playouts) { FastState tempstate = m_rootstate; int color = tempstate.board.get_to_move(); - std::string pvstring = get_pv(tempstate, *m_root); - float winrate = 100.0f * m_root->get_eval(color); - myprintf("Playouts: %d, Win: %5.2f%%, PV: %s\n", - playouts, winrate, pvstring.c_str()); + auto pvstring = get_pv(tempstate, *m_root); + float winrate = 100.0f * m_root->get_raw_eval(color); + return str(boost::format("Playouts: %d, Win: %5.2f%%, PV: %s") + % playouts % winrate % pvstring.c_str()); } bool UCTSearch::is_running() const { - return m_run && m_nodes < MAX_TREE_SIZE; + return m_run && UCTNodePointer::get_tree_size() < cfg_max_tree_size; } -int UCTSearch::est_playouts_left(int elapsed_centis, int time_for_move) const { +int UCTSearch::est_playouts_left(const int elapsed_centis, + const int time_for_move) const { auto playouts = m_playouts.load(); const auto playouts_left = std::max(0, std::min(m_maxplayouts - playouts, @@ -491,14 +663,21 @@ int UCTSearch::est_playouts_left(int elapsed_centis, int time_for_move) const { static_cast(std::ceil(playout_rate * time_left))); } -size_t UCTSearch::prune_noncontenders(int elapsed_centis, int time_for_move) { +size_t UCTSearch::prune_noncontenders(const int color, const int elapsed_centis, + const int time_for_move, + const bool prune) { + auto lcb_max = 0.0f; auto Nfirst = 0; // There are no cases where the root's children vector gets modified 
// during a multithreaded search, so it is safe to walk it here without // taking the (root) node lock. for (const auto& node : m_root->get_children()) { if (node->valid()) { - Nfirst = std::max(Nfirst, node->get_visits()); + const auto visits = node->get_visits(); + if (visits > 0) { + lcb_max = std::max(lcb_max, node->get_eval_lcb(color)); + } + Nfirst = std::max(Nfirst, visits); } } const auto min_required_visits = @@ -506,11 +685,18 @@ size_t UCTSearch::prune_noncontenders(int elapsed_centis, int time_for_move) { auto pruned_nodes = size_t{0}; for (const auto& node : m_root->get_children()) { if (node->valid()) { - const auto has_enough_visits = - node->get_visits() >= min_required_visits; - - node->set_active(has_enough_visits); - if (!has_enough_visits) { + const auto visits = node->get_visits(); + const auto has_enough_visits = visits >= min_required_visits; + // Avoid pruning moves that could have the best lower confidence + // bound. + const auto high_winrate = + visits > 0 ? node->get_raw_eval(color) >= lcb_max : false; + const auto prune_this_node = !(has_enough_visits || high_winrate); + + if (prune) { + node->set_active(!prune_this_node); + } + if (prune_this_node) { ++pruned_nodes; } } @@ -520,11 +706,16 @@ size_t UCTSearch::prune_noncontenders(int elapsed_centis, int time_for_move) { return pruned_nodes; } -bool UCTSearch::have_alternate_moves(int elapsed_centis, int time_for_move) { +bool UCTSearch::have_alternate_moves(const int elapsed_centis, + const int time_for_move) { if (cfg_timemanage == TimeManagement::OFF) { return true; } - auto pruned = prune_noncontenders(elapsed_centis, time_for_move); + auto my_color = m_rootstate.get_to_move(); + // For self play use. Disables pruning of non-contenders to not bias the training data. 
+ auto prune = cfg_timemanage != TimeManagement::NO_PRUNING; + auto pruned = + prune_noncontenders(my_color, elapsed_centis, time_for_move, prune); if (pruned < m_root->get_children().size() - 1) { return true; } @@ -533,7 +724,6 @@ bool UCTSearch::have_alternate_moves(int elapsed_centis, int time_for_move) { // which will cause Leela to quickly respond to obvious/forced moves. // That comes at the cost of some playing strength as she now cannot // think ahead about her next moves in the remaining time. - auto my_color = m_rootstate.get_to_move(); auto tc = m_rootstate.get_timecontrol(); if (!tc.can_accumulate_time(my_color) || m_maxplayouts < UCTSearch::UNLIMITED_PLAYOUTS) { @@ -547,32 +737,36 @@ bool UCTSearch::have_alternate_moves(int elapsed_centis, int time_for_move) { // save at least half a second. if (time_for_move - elapsed_centis > 50) { myprintf("%.1fs left, stopping early.\n", - (time_for_move - elapsed_centis) / 100.0f); + (time_for_move - elapsed_centis) / 100.0f); } return false; } -bool UCTSearch::stop_thinking(int elapsed_centis, int time_for_move) const { - return m_playouts >= m_maxplayouts - || m_root->get_visits() >= m_maxvisits +bool UCTSearch::stop_thinking(const int elapsed_centis, + const int time_for_move) const { + return m_playouts >= m_maxplayouts || m_root->get_visits() >= m_maxvisits || elapsed_centis >= time_for_move; } void UCTWorker::operator()() { - do { - auto currstate = std::make_unique(m_rootstate); - auto result = m_search->play_simulation(*currstate, m_root); - if (result.valid()) { - m_search->increment_playouts(); - } - } while (m_search->is_running()); + try { + do { + auto currstate = std::make_unique(m_rootstate); + auto result = m_search->play_simulation(*currstate, m_root); + if (result.valid()) { + m_search->increment_playouts(); + } + } while (m_search->is_running()); + } catch (NetworkHaltException&) { + // intentionally empty + } } void UCTSearch::increment_playouts() { m_playouts++; } -int UCTSearch::think(int 
color, passflag_t passflag) { +int UCTSearch::think(const int color, const passflag_t passflag) { // Start counting time for us m_rootstate.start_clock(color); @@ -583,52 +777,61 @@ int UCTSearch::think(int color, passflag_t passflag) { // set side to move m_rootstate.board.set_to_move(color); - m_rootstate.get_timecontrol().set_boardsize( - m_rootstate.board.get_boardsize()); - auto time_for_move = m_rootstate.get_timecontrol().max_time_for_move(color, m_rootstate.get_movenum()); + auto time_for_move = m_rootstate.get_timecontrol().max_time_for_move( + m_rootstate.board.get_boardsize(), color, m_rootstate.get_movenum()); - myprintf("Thinking at most %.1f seconds...\n", time_for_move/100.0f); + myprintf("Thinking at most %.1f seconds...\n", time_for_move / 100.0f); // create a sorted list of legal moves (make sure we // play something legal and decent even in time trouble) - m_root->prepare_root_node(color, m_nodes, m_rootstate); + m_root->prepare_root_node(m_network, color, m_nodes, m_rootstate); m_run = true; int cpus = cfg_num_threads; ThreadGroup tg(thread_pool); - for (int i = 1; i < cpus; i++) { + for (int i = 0; i < cpus; i++) { tg.add_task(UCTWorker(m_rootstate, this, m_root.get())); } - bool keeprunning = true; - int last_update = 0; + auto keeprunning = true; + auto last_update = 0; + auto last_output = 0; do { - auto currstate = std::make_unique(m_rootstate); - - auto result = play_simulation(*currstate, m_root.get()); - if (result.valid()) { - increment_playouts(); - } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); Time elapsed; int elapsed_centis = Time::timediff_centis(start, elapsed); + if (cfg_analyze_tags.interval_centis() + && elapsed_centis - last_output + > cfg_analyze_tags.interval_centis()) { + last_output = elapsed_centis; + output_analysis(m_rootstate, *m_root); + } + // output some stats every few seconds // check if we should still search - if (elapsed_centis - last_update > 250) { + if (!cfg_quiet && elapsed_centis - 
last_update > 250) { last_update = elapsed_centis; - dump_analysis(static_cast(m_playouts)); + myprintf("%s\n", get_analysis(m_playouts.load()).c_str()); } - keeprunning = is_running(); + keeprunning = is_running(); keeprunning &= !stop_thinking(elapsed_centis, time_for_move); keeprunning &= have_alternate_moves(elapsed_centis, time_for_move); } while (keeprunning); - // stop the search + // Make sure to post at least once. + if (cfg_analyze_tags.interval_centis() && last_output == 0) { + output_analysis(m_rootstate, *m_root); + } + + // Stop the search. m_run = false; + m_network.drain_evals(); tg.wait_all(); + m_network.resume_evals(); - // reactivate all pruned root children + // Reactivate all pruned root children. for (const auto& node : m_root->get_children()) { node->set_active(true); } @@ -638,74 +841,112 @@ int UCTSearch::think(int color, passflag_t passflag) { return FastBoard::PASS; } - // display search info + // Display search info. myprintf("\n"); dump_stats(m_rootstate, *m_root); - Training::record(m_rootstate, *m_root); + Training::record(m_network, m_rootstate, *m_root); Time elapsed; int elapsed_centis = Time::timediff_centis(start, elapsed); - if (elapsed_centis+1 > 0) { - myprintf("%d visits, %d nodes, %d playouts, %.0f n/s\n\n", - m_root->get_visits(), - static_cast(m_nodes), - static_cast(m_playouts), - (m_playouts * 100.0) / (elapsed_centis+1)); - } + myprintf("%d visits, %d nodes, %d playouts, %.0f n/s\n\n", + m_root->get_visits(), m_nodes.load(), m_playouts.load(), + (m_playouts * 100.0) / (elapsed_centis + 1)); + +#ifdef USE_OPENCL +#ifndef NDEBUG + myprintf("batch stats: %d %d\n", + batch_stats.single_evals.load(), batch_stats.batch_evals.load()); +#endif +#endif + int bestmove = get_best_move(passflag); + // Save the explanation. + m_think_output = + str(boost::format("move %d, %c => %s\n%s") + % m_rootstate.get_movenum() + % (color == FastBoard::BLACK ? 
'B' : 'W') + % m_rootstate.move_to_text(bestmove).c_str() + % get_analysis(m_root->get_visits()).c_str()); + // Copy the root state. Use to check for tree re-use in future calls. m_last_rootstate = std::make_unique(m_rootstate); return bestmove; } +// Brief output from last think() call. +std::string UCTSearch::explain_last_think() const { + return m_think_output; +} + void UCTSearch::ponder() { + auto disable_reuse = cfg_analyze_tags.has_move_restrictions(); + if (disable_reuse) { + m_last_rootstate.reset(nullptr); + } + update_root(); - m_root->prepare_root_node(m_rootstate.board.get_to_move(), + m_root->prepare_root_node(m_network, m_rootstate.board.get_to_move(), m_nodes, m_rootstate); m_run = true; ThreadGroup tg(thread_pool); - for (int i = 1; i < cfg_num_threads; i++) { + for (auto i = size_t{0}; i < cfg_num_threads; i++) { tg.add_task(UCTWorker(m_rootstate, this, m_root.get())); } + Time start; auto keeprunning = true; + auto last_output = 0; do { - auto currstate = std::make_unique(m_rootstate); - auto result = play_simulation(*currstate, m_root.get()); - if (result.valid()) { - increment_playouts(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + if (cfg_analyze_tags.interval_centis()) { + Time elapsed; + int elapsed_centis = Time::timediff_centis(start, elapsed); + if (elapsed_centis - last_output + > cfg_analyze_tags.interval_centis()) { + last_output = elapsed_centis; + output_analysis(m_rootstate, *m_root); + } } - keeprunning = is_running(); + keeprunning = is_running(); keeprunning &= !stop_thinking(0, 1); } while (!Utils::input_pending() && keeprunning); - // stop the search + // Make sure to post at least once. + if (cfg_analyze_tags.interval_centis() && last_output == 0) { + output_analysis(m_rootstate, *m_root); + } + + // Stop the search. m_run = false; + m_network.drain_evals(); tg.wait_all(); + m_network.resume_evals(); - // display search info + // Display search info. 
myprintf("\n"); dump_stats(m_rootstate, *m_root); myprintf("\n%d visits, %d nodes\n\n", m_root->get_visits(), m_nodes.load()); // Copy the root state. Use to check for tree re-use in future calls. - m_last_rootstate = std::make_unique(m_rootstate); + if (!disable_reuse) { + m_last_rootstate = std::make_unique(m_rootstate); + } } -void UCTSearch::set_playout_limit(int playouts) { - static_assert(std::is_convertible::value, - "Inconsistent types for playout amount."); +void UCTSearch::set_playout_limit(const int playouts) { + static_assert( + std::is_convertible::value, + "Inconsistent types for playout amount."); m_maxplayouts = std::min(playouts, UNLIMITED_PLAYOUTS); } -void UCTSearch::set_visit_limit(int visits) { - static_assert(std::is_convertible::value, - "Inconsistent types for visits amount."); +void UCTSearch::set_visit_limit(const int visits) { + static_assert( + std::is_convertible::value, + "Inconsistent types for visits amount."); // Limit to type max / 2 to prevent overflow when multithreading. m_maxvisits = std::min(visits, UNLIMITED_PLAYOUTS); } diff --git a/src/UCTSearch.h b/src/UCTSearch.h index a99e10f91..8269f4026 100644 --- a/src/UCTSearch.h +++ b/src/UCTSearch.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto + Copyright (C) 2017-2019 Gian-Carlo Pascutto Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,34 +14,49 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef UCTSEARCH_H_INCLUDED #define UCTSEARCH_H_INCLUDED -#include #include +#include +#include #include #include #include -#include -#include "ThreadPool.h" #include "FastBoard.h" #include "FastState.h" #include "GameState.h" +#include "Network.h" +#include "ThreadPool.h" #include "UCTNode.h" - class SearchResult { public: SearchResult() = default; - bool valid() const { return m_valid; } - float eval() const { return m_eval; } - static SearchResult from_eval(float eval) { + bool valid() const { + return m_valid; + } + float eval() const { + return m_eval; + } + static SearchResult from_eval(const float eval) { return SearchResult(eval); } - static SearchResult from_score(float board_score) { + static SearchResult from_score(const float board_score) { if (board_score > 0.0f) { return SearchResult(1.0f); } else if (board_score < 0.0f) { @@ -50,16 +65,16 @@ class SearchResult { return SearchResult(0.5f); } } + private: - explicit SearchResult(float eval) - : m_valid(true), m_eval(eval) {} + explicit SearchResult(const float eval) : m_valid(true), m_eval(eval) {} bool m_valid{false}; float m_eval{0.0f}; }; namespace TimeManagement { enum enabled_t { - AUTO = -1, OFF = 0, ON = 1, FAST = 2 + AUTO = -1, OFF = 0, ON = 1, FAST = 2, NO_PRUNING = 3 }; }; @@ -76,12 +91,16 @@ class UCTSearch { static constexpr passflag_t NORESIGN = 1 << 1; /* - Maximum size of the tree in memory. 
Nodes are about - 48 bytes, so limit to ~1.2G on 32-bits and about 5.5G - on 64-bits. + Default memory limit in bytes. + ~1.6GiB on 32-bits and about 5.2GiB on 64-bits. + */ + static constexpr size_t DEFAULT_MAX_MEMORY = + (sizeof(void*) == 4 ? 1'600'000'000 : 5'200'000'000); + + /* + Minimum allowed size for maximum tree size. */ - static constexpr auto MAX_TREE_SIZE = - (sizeof(void*) == 4 ? 25'000'000 : 100'000'000); + static constexpr size_t MIN_TREE_SPACE = 100'000'000; /* Value representing unlimited visits or playouts. Due to @@ -91,31 +110,34 @@ class UCTSearch { static constexpr auto UNLIMITED_PLAYOUTS = std::numeric_limits::max() / 2; - UCTSearch(GameState& g); + UCTSearch(GameState& g, Network& network); int think(int color, passflag_t passflag = NORMAL); void set_playout_limit(int playouts); void set_visit_limit(int visits); void ponder(); bool is_running() const; void increment_playouts(); - SearchResult play_simulation(GameState& currstate, UCTNode* const node); + std::string explain_last_think() const; + SearchResult play_simulation(GameState& currstate, UCTNode* node); private: float get_min_psa_ratio() const; - void dump_stats(FastState& state, UCTNode& parent); + void dump_stats(const FastState& state, UCTNode& parent); void tree_stats(const UCTNode& node); - std::string get_pv(FastState& state, UCTNode& parent); - void dump_analysis(int playouts); - bool should_resign(passflag_t passflag, float bestscore); + std::string get_pv(FastState& state, const UCTNode& parent); + std::string get_analysis(int playouts); + bool should_resign(passflag_t passflag, float besteval); bool have_alternate_moves(int elapsed_centis, int time_for_move); int est_playouts_left(int elapsed_centis, int time_for_move) const; - size_t prune_noncontenders(int elapsed_centis = 0, int time_for_move = 0); + size_t prune_noncontenders(int color, int elapsed_centis = 0, + int time_for_move = 0, bool prune = true); bool stop_thinking(int elapsed_centis = 0, int time_for_move = 0) 
const; int get_best_move(passflag_t passflag); void update_root(); bool advance_to_new_rootstate(); + void output_analysis(const FastState& state, const UCTNode& parent); - GameState & m_rootstate; + GameState& m_rootstate; std::unique_ptr m_last_rootstate; std::unique_ptr m_root; std::atomic m_nodes{0}; @@ -123,19 +145,23 @@ class UCTSearch { std::atomic m_run{false}; int m_maxplayouts; int m_maxvisits; + std::string m_think_output; std::list m_delete_futures; + + Network& m_network; }; class UCTWorker { public: - UCTWorker(GameState & state, UCTSearch * search, UCTNode * root) - : m_rootstate(state), m_search(search), m_root(root) {} + UCTWorker(GameState& state, UCTSearch* const search, UCTNode* const root) + : m_rootstate(state), m_search(search), m_root(root) {} void operator()(); + private: - GameState & m_rootstate; - UCTSearch * m_search; - UCTNode * m_root; + GameState& m_rootstate; + UCTSearch* m_search; + UCTNode* m_root; }; #endif diff --git a/src/Utils.cpp b/src/Utils.cpp index 84d291362..6ac2bdac0 100644 --- a/src/Utils.cpp +++ b/src/Utils.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,32 +14,74 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" -#include "Utils.h" -#include +#include +#include #include #include +#include + +#include "Utils.h" #ifdef _WIN32 #include #else +#include #include +#include +#include #endif #include "GTP.h" Utils::ThreadPool thread_pool; -bool Utils::input_pending(void) { +auto constexpr z_entries = 1000; +std::array z_lookup; + +void Utils::create_z_table() { + for (auto i = 1; i < z_entries + 1; i++) { + boost::math::students_t dist(i); + auto z = + boost::math::quantile(boost::math::complement(dist, cfg_ci_alpha)); + z_lookup[i - 1] = z; + } +} + +float Utils::cached_t_quantile(const int v) { + if (v < 1) { + return z_lookup[0]; + } + if (v < z_entries) { + return z_lookup[v - 1]; + } + // z approaches constant when v is high enough. + // With default lookup table size the function is flat enough that we + // can just return the last entry for all v bigger than it. 
+ return z_lookup[z_entries - 1]; +} + +bool Utils::input_pending() { #ifdef HAVE_SELECT fd_set read_fds; FD_ZERO(&read_fds); - FD_SET(0,&read_fds); - struct timeval timeout{0,0}; - select(1,&read_fds,nullptr,nullptr,&timeout); + FD_SET(0, &read_fds); + struct timeval timeout{0, 0}; + select(1, &read_fds, nullptr, nullptr, &timeout); return FD_ISSET(0, &read_fds); #else static int init = 0, pipe; @@ -51,7 +93,8 @@ bool Utils::input_pending(void) { inh = GetStdHandle(STD_INPUT_HANDLE); pipe = !GetConsoleMode(inh, &dw); if (!pipe) { - SetConsoleMode(inh, dw & ~(ENABLE_MOUSE_INPUT | ENABLE_WINDOW_INPUT)); + SetConsoleMode(inh, + dw & ~(ENABLE_MOUSE_INPUT | ENABLE_WINDOW_INPUT)); FlushConsoleInputBuffer(inh); } } @@ -77,52 +120,78 @@ bool Utils::input_pending(void) { static std::mutex IOmutex; -void Utils::myprintf(const char *fmt, ...) { +static void myprintf_base(const char* const fmt, va_list ap) { + va_list ap2; + va_copy(ap2, ap); + + vfprintf(stderr, fmt, ap); + + if (cfg_logfile_handle) { + std::lock_guard lock(IOmutex); + vfprintf(cfg_logfile_handle, fmt, ap2); + } + va_end(ap2); +} + +void Utils::myprintf(const char* const fmt, ...) { if (cfg_quiet) { return; } + va_list ap; va_start(ap, fmt); - vfprintf(stderr, fmt, ap); + myprintf_base(fmt, ap); va_end(ap); +} - if (cfg_logfile_handle) { - std::lock_guard lock(IOmutex); - va_start(ap, fmt); - vfprintf(cfg_logfile_handle, fmt, ap); - va_end(ap); - } +void Utils::myprintf_error(const char* const fmt, ...) 
{ + va_list ap; + va_start(ap, fmt); + myprintf_base(fmt, ap); + va_end(ap); } -static void gtp_fprintf(FILE* file, const std::string& prefix, - const char *fmt, va_list ap) { +static void gtp_fprintf(FILE* const file, const std::string& prefix, + const char* const fmt, va_list ap) { fprintf(file, "%s ", prefix.c_str()); vfprintf(file, fmt, ap); fprintf(file, "\n\n"); } -static void gtp_base_printf(int id, std::string prefix, - const char *fmt, va_list ap) { +static void gtp_base_printf(const int id, std::string prefix, + const char* const fmt, va_list ap) { if (id != -1) { prefix += std::to_string(id); } - gtp_fprintf(stdout, prefix, fmt, ap); - if (cfg_logfile_handle) { std::lock_guard lock(IOmutex); gtp_fprintf(cfg_logfile_handle, prefix, fmt, ap); } } -void Utils::gtp_printf(int id, const char *fmt, ...) { +void Utils::gtp_printf(const int id, const char* const fmt, ...) { va_list ap; va_start(ap, fmt); gtp_base_printf(id, "=", fmt, ap); va_end(ap); } -void Utils::gtp_fail_printf(int id, const char *fmt, ...) { +void Utils::gtp_printf_raw(const char* const fmt, ...) { + va_list ap; + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + va_end(ap); + + if (cfg_logfile_handle) { + std::lock_guard lock(IOmutex); + va_start(ap, fmt); + vfprintf(cfg_logfile_handle, fmt, ap); + va_end(ap); + } +} + +void Utils::gtp_fail_printf(const int id, const char* const fmt, ...) 
{ va_list ap; va_start(ap, fmt); gtp_base_printf(id, "?", fmt, ap); @@ -136,7 +205,7 @@ void Utils::log_input(const std::string& input) { } } -size_t Utils::ceilMultiple(size_t a, size_t b) { +size_t Utils::ceilMultiple(const size_t a, const size_t b) { if (a % b == 0) { return a; } @@ -144,3 +213,25 @@ size_t Utils::ceilMultiple(size_t a, size_t b) { auto ret = a + (b - a % b); return ret; } + +std::string Utils::leelaz_file(const std::string& file) { +#if defined(_WIN32) || defined(__ANDROID__) + boost::filesystem::path dir(boost::filesystem::current_path()); +#else + // https://stackoverflow.com/a/26696759 + const char* homedir; + if ((homedir = getenv("HOME")) == nullptr) { + struct passwd* pwd; + // NOLINTNEXTLINE(runtime/threadsafe_fn) + if ((pwd = getpwuid(getuid())) == nullptr) { + return std::string(); + } + homedir = pwd->pw_dir; + } + boost::filesystem::path dir(homedir); + dir /= ".local/share/leela-zero"; +#endif + boost::filesystem::create_directories(dir); + dir /= file; + return dir.string(); +} diff --git a/src/Utils.h b/src/Utils.h index 33a10aef7..95fe53e97 100644 --- a/src/Utils.h +++ b/src/Utils.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,10 +14,21 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . 
+ + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ -#ifndef UTILS_H_DEFINED -#define UTILS_H_DEFINED +#ifndef UTILS_H_INCLUDED +#define UTILS_H_INCLUDED #include "config.h" @@ -30,28 +41,35 @@ extern Utils::ThreadPool thread_pool; namespace Utils { - void myprintf(const char *fmt, ...); - void gtp_printf(int id, const char *fmt, ...); - void gtp_fail_printf(int id, const char *fmt, ...); + void myprintf_error(const char* fmt, ...); + void myprintf(const char* fmt, ...); + void gtp_printf(int id, const char* fmt, ...); + void gtp_printf_raw(const char* fmt, ...); + void gtp_fail_printf(int id, const char* fmt, ...); void log_input(const std::string& input); bool input_pending(); - template - void atomic_add(std::atomic &f, T d) { + template + void atomic_add(std::atomic& f, const T d) { T old = f.load(); - while (!f.compare_exchange_weak(old, old + d)); + while (!f.compare_exchange_weak(old, old + d)) {} } - template + template T rotl(const T x, const int k) { return (x << k) | (x >> (std::numeric_limits::digits - k)); } - inline bool is7bit(int c) { + inline bool is7bit(const int c) { return c >= 0 && c <= 127; } size_t ceilMultiple(size_t a, size_t b); + + std::string leelaz_file(const std::string& file); + + void create_z_table(); + float cached_t_quantile(int v); } #endif diff --git a/src/Zobrist.cpp b/src/Zobrist.cpp index 0dd888253..3f94db715 100644 --- a/src/Zobrist.cpp +++ b/src/Zobrist.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. 
- Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,35 +14,48 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #include "config.h" + #include "Zobrist.h" + #include "Random.h" -std::array, 4> Zobrist::zobrist; -std::array Zobrist::zobrist_ko; -std::array, 2> Zobrist::zobrist_pris; -std::array Zobrist::zobrist_pass; +std::array, 4> Zobrist::zobrist; +std::array Zobrist::zobrist_ko; +std::array, 2> Zobrist::zobrist_pris; +std::array Zobrist::zobrist_pass; void Zobrist::init_zobrist(Random& rng) { for (int i = 0; i < 4; i++) { - for (int j = 0; j < FastBoard::MAXSQ; j++) { + for (int j = 0; j < FastBoard::NUM_VERTICES; j++) { Zobrist::zobrist[i][j] = rng.randuint64(); } } - for (int j = 0; j < FastBoard::MAXSQ; j++) { + for (int j = 0; j < FastBoard::NUM_VERTICES; j++) { Zobrist::zobrist_ko[j] = rng.randuint64(); } for (int i = 0; i < 2; i++) { - for (int j = 0; j < FastBoard::MAXSQ * 2; j++) { + for (int j = 0; j < FastBoard::NUM_VERTICES * 2; j++) { Zobrist::zobrist_pris[i][j] = rng.randuint64(); } } for (int i = 0; i < 5; i++) { - Zobrist::zobrist_pass[i] = rng.randuint64(); + Zobrist::zobrist_pass[i] = rng.randuint64(); } } diff --git a/src/Zobrist.h b/src/Zobrist.h index 
7c0d5b83d..36f97faf4 100644 --- a/src/Zobrist.h +++ b/src/Zobrist.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. */ #ifndef ZOBRIST_H_INCLUDED #define ZOBRIST_H_INCLUDED @@ -31,10 +42,10 @@ class Zobrist { static constexpr auto zobrist_empty = 0x1234567887654321; static constexpr auto zobrist_blacktomove = 0xABCDABCDABCDABCD; - static std::array, 4> zobrist; - static std::array zobrist_ko; - static std::array, 2> zobrist_pris; - static std::array zobrist_pass; + static std::array, 4> zobrist; + static std::array zobrist_ko; + static std::array, 2> zobrist_pris; + static std::array zobrist_pass; static void init_zobrist(Random& rng); }; diff --git a/src/clblast_level3/xgemm_batched.opencl b/src/clblast_level3/xgemm_batched.opencl deleted file mode 100644 index 041a05b9f..000000000 --- a/src/clblast_level3/xgemm_batched.opencl +++ /dev/null @@ -1,62 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the batched version of the non-direct GEMM kernel. See part 1 for information -// about the non-batched version of the kernel. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Main entry point of the kernel. This is the regular full version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* restrict cgm) { - const int batch = get_group_id(2); - - // Sets the offsets - const int a_offset = kSizeM*kSizeK*batch; - const int b_offset = kSizeK*kSizeN*batch; - const int c_offset = kSizeM*kSizeN*batch; - const __global realM* restrict agm_ = &agm[a_offset / VWM]; - const __global realN* restrict bgm_ = &bgm[b_offset / VWN]; - __global realM* restrict cgm_ = &cgm[c_offset / VWM]; - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in global memory - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_); - #endif -} - -// 
================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/clblast_level3/xgemm_part1.opencl b/src/clblast_level3/xgemm_part1.opencl deleted file mode 100644 index 4e1c3e611..000000000 --- a/src/clblast_level3/xgemm_part1.opencl +++ /dev/null @@ -1,316 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains an optimized matrix-multiplication kernel inspired by the paper by Matsumoto -// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable -// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It -// supports different data-types (SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM) through a pre-processor define. -// -// Matrices are accessed as follows: -// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) -// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) -// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) -// -// Or as an image (assuming column-major) -// K -// o-------o -// | | -// N | [B^T] | -// | | -// o-------o -// K N -// o-------o o-----o -// M | [A] | M | [C] | -// | | | | -// o-------o o-----o -// -// -// This kernel is separated into three files. This is part 1 out of 4. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). 
Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef MWG - #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) -#endif -#ifndef NWG - #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) -#endif -#ifndef KWG - #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) -#endif -#ifndef MDIMC - #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) -#endif -#ifndef NDIMC - #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) -#endif -#ifndef MDIMA - #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA -#endif -#ifndef NDIMB - #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB -#endif -#ifndef KWI - #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) -#endif -#ifndef VWM - #define VWM 1 // Vector width of matrices A and C -#endif -#ifndef VWN - #define VWN 1 // Vector width of matrix B -#endif -#ifndef STRM - #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) -#endif -#ifndef STRN - #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) -#endif -#ifndef SA - #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) -#endif -#ifndef SB - #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) -#endif - -// Helper parameters based on the above tuning parameters -#define MWI (MWG/MDIMC) // Work per work-item (M-dimension) -#define NWI (NWG/NDIMC) // Work per work-item (N-dimension) -#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA -#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB -#define MWA 
(MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) -#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) -#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) -#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) - -// Settings -#ifndef USE_VECTOR_MAD - #define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually -#endif -#ifndef GLOBAL_MEM_FENCE - #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance -#endif - -// ================================================================================================= - -// Data-widths in dimension M -#if VWM == 1 - typedef real realM; -#elif VWM == 2 - typedef real2 realM; -#elif VWM == 4 - typedef real4 realM; -#elif VWM == 8 - typedef real8 realM; -#elif VWM == 16 - typedef real16 realM; -#endif - -// Data-widths in dimension N -#if VWN == 1 - typedef real realN; -#elif VWN == 2 - typedef real2 realN; -#elif VWN == 4 - typedef real4 realN; -#elif VWN == 8 - typedef real8 realN; -#elif VWN == 16 - typedef real16 realN; -#endif - -// ================================================================================================= - -// Initializes the accumulation registers to zero -INLINE_FUNC realM InitAccRegisters() { - realM result; - #if VWM == 1 - SetToZero(result); - #elif VWM == 2 - SetToZero(result.x); - SetToZero(result.y); - #elif VWM == 4 - SetToZero(result.x); - SetToZero(result.y); - SetToZero(result.z); - SetToZero(result.w); - #elif VWM == 8 - SetToZero(result.s0); - SetToZero(result.s1); - SetToZero(result.s2); - SetToZero(result.s3); - SetToZero(result.s4); - SetToZero(result.s5); - SetToZero(result.s6); - SetToZero(result.s7); - #elif VWM == 16 - SetToZero(result.s0); - SetToZero(result.s1); - SetToZero(result.s2); - SetToZero(result.s3); - SetToZero(result.s4); - SetToZero(result.s5); - SetToZero(result.s6); - SetToZero(result.s7); - 
SetToZero(result.s8); - SetToZero(result.s9); - SetToZero(result.sA); - SetToZero(result.sB); - SetToZero(result.sC); - SetToZero(result.sD); - SetToZero(result.sE); - SetToZero(result.sF); - #endif - return result; -} - -// ================================================================================================= - -// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for -// caching the A input matrix. -#if SA == 1 -INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, LOCAL_PTR realM* alm, - const int kSizeM, const int tid, const int kwg) { - const int la0 = tid % MDIMA; - const int la1 = tid / MDIMA; - #pragma unroll - for (int _mia = 0; _mia < MWA/VWM; _mia += 1) { - #pragma unroll - for (int _kia = 0; _kia < KWA; _kia += 1) { - - // Computes the indices based on strided/non-strided access - #if STRM == 0 - int mg = _mia + la0*(MWA/VWM); - #elif STRM == 1 - int mg = la0 + _mia*MDIMA; - #endif - - // Computes the indices for the global memory - int kg = _kia + la1*KWA; - int idm = mg + GetGroupID0() * (MWG/VWM); - int idk = kg + kwg; - - // Loads the data from global memory (not transposed) into the local memory - alm[kg*(MWG/VWM) + mg] = agm[idk*(kSizeM/VWM) + idm]; - } - } -} -#endif - -// Same as above, but now for the B input matrix -#if SB == 1 -INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, LOCAL_PTR realN* blm, - const int kSizeN, const int tid, const int kwg) { - const int lb0 = tid % NDIMB; - const int lb1 = tid / NDIMB; - #pragma unroll - for (int _kib = 0; _kib < KWB; _kib += 1) { - #pragma unroll - for (int _nib = 0; _nib < NWB/VWN; _nib += 1) { - - // Computes the indices based on strided/non-strided access - #if STRN == 0 - int ng = _nib + lb0*(NWB/VWN); - #elif STRN == 1 - int ng = lb0 + _nib*NDIMB; - #endif - - // Computes the indices for the global memory - int kg = _kib + lb1*KWB; - int idn = ng + GetGroupID1() * (NWG/VWN); - int idk = kg + kwg; - - // Loads 
the data from global memory (transposed) into the local memory - blm[kg*(NWG/VWN) + ng] = bgm[idk*(kSizeN/VWN) + idn]; - } - } -} -#endif - -// ================================================================================================= - -// Caches global off-chip memory directly into per-thread private memory (registers). This function -// is specific for caching the A input matrix. -#if SA == 0 -INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm, const int _mi, - const int kSizeM, const int idk, const int kwg) { - // Computes the indices based on strided/non-strided access - #if STRM == 0 - int mg = _mi + get_local_id(0)*(MWI/VWM); - #elif STRM == 1 - int mg = get_local_id(0) + _mi*MDIMC; - #endif - - // Computes the indices for the global memory - int idm = mg + GetGroupID0() * (MWG/VWM); - - // Loads the data from global memory (not transposed) and stores into registers - return agm[idk*(kSizeM/VWM) + idm]; -} -#endif - -// Same as above, but now for the B input matrix -#if SB == 0 -INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm, const int _ni, - const int kSizeN, const int idk) { - // Computes the indices based on strided/non-strided access - #if STRN == 0 - int ng = _ni + get_local_id(1)*(NWI/VWN); - #elif STRN == 1 - int ng = get_local_id(1) + _ni*NDIMC; - #endif - - // Computes the indices for the global memory - int idn = ng + GetGroupID1() * (NWG/VWN); - - // Loads the data from global memory (transposed) and stores into registers - return bgm[idk*(kSizeN/VWN) + idn]; -} -#endif - -// ================================================================================================= - -// Caches on-chip local memory into per-thread private memory (registers). This function is specific -// for caching the A input matrix. 
-#if SA == 1 -INLINE_FUNC realM LocalToPrivateA(LOCAL_PTR realM* alm, const int _mi, const int kg) { - #if STRM == 0 - int mg = _mi + get_local_id(0)*(MWI/VWM); - #elif STRM == 1 - int mg = get_local_id(0) + _mi*MDIMC; - #endif - return alm[kg*(MWG/VWM) + mg]; -} -#endif - -// Same as above, but now for the B input matrix -#if SB == 1 -INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int kg) { - #if STRN == 0 - int ng = _ni + get_local_id(1)*(NWI/VWN); - #elif STRN == 1 - int ng = get_local_id(1) + _ni*NDIMC; - #endif - return blm[kg*(NWG/VWN) + ng]; -} -#endif - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/clblast_level3/xgemm_part2.opencl b/src/clblast_level3/xgemm_part2.opencl deleted file mode 100644 index e2dd89b54..000000000 --- a/src/clblast_level3/xgemm_part2.opencl +++ /dev/null @@ -1,99 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This is part 2 of 4 of the GEMM kernel. See part 1 for more information. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. 
-R"( - -// ================================================================================================= - -// The vectorised multiply-add function -INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { - #if USE_VECTOR_MAD == 1 - cvec += avec * bval; - #else - #if VWM == 1 - MultiplyAdd(cvec, avec, bval); - #elif VWM == 2 - MultiplyAdd(cvec.x , avec.x, bval); - MultiplyAdd(cvec.y , avec.y, bval); - #elif VWM == 4 - MultiplyAdd(cvec.x , avec.x, bval); - MultiplyAdd(cvec.y , avec.y, bval); - MultiplyAdd(cvec.z , avec.z, bval); - MultiplyAdd(cvec.w , avec.w, bval); - #elif VWM == 8 - MultiplyAdd(cvec.s0, avec.s0, bval); - MultiplyAdd(cvec.s1, avec.s1, bval); - MultiplyAdd(cvec.s2, avec.s2, bval); - MultiplyAdd(cvec.s3, avec.s3, bval); - MultiplyAdd(cvec.s4, avec.s4, bval); - MultiplyAdd(cvec.s5, avec.s5, bval); - MultiplyAdd(cvec.s6, avec.s6, bval); - MultiplyAdd(cvec.s7, avec.s7, bval); - #elif VWM == 16 - MultiplyAdd(cvec.s0, avec.s0, bval); - MultiplyAdd(cvec.s1, avec.s1, bval); - MultiplyAdd(cvec.s2, avec.s2, bval); - MultiplyAdd(cvec.s3, avec.s3, bval); - MultiplyAdd(cvec.s4, avec.s4, bval); - MultiplyAdd(cvec.s5, avec.s5, bval); - MultiplyAdd(cvec.s6, avec.s6, bval); - MultiplyAdd(cvec.s7, avec.s7, bval); - MultiplyAdd(cvec.s8, avec.s8, bval); - MultiplyAdd(cvec.s9, avec.s9, bval); - MultiplyAdd(cvec.sA, avec.sA, bval); - MultiplyAdd(cvec.sB, avec.sB, bval); - MultiplyAdd(cvec.sC, avec.sC, bval); - MultiplyAdd(cvec.sD, avec.sD, bval); - MultiplyAdd(cvec.sE, avec.sE, bval); - MultiplyAdd(cvec.sF, avec.sF, bval); - #endif - #endif - return cvec; -} - -// ================================================================================================= - -// Merges the results in Cpm with the global array in Cgm. 
-INLINE_FUNC void StoreResults(__global realM* cgm, realM cpm[NWI*MWI/VWM], const int kSizeM) { - #pragma unroll - for (int _ni = 0; _ni < NWI; _ni += 1) { - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - #if STRM == 0 - int mg = _mi + get_local_id(0)*(MWI/VWM); - #elif STRM == 1 - int mg = get_local_id(0) + _mi*MDIMC; - #endif - #if STRN == 0 - int ng = _ni + get_local_id(1)*NWI; - #elif STRN == 1 - int ng = _ni%VWN + get_local_id(1)*VWN + (_ni/VWN)*VWN*NDIMC; - #endif - int idm = mg + GetGroupID0() * (MWG/VWM); - int idn = ng + GetGroupID1() * NWG; - int index = idn*(kSizeM/VWM) + idm; - - cgm[index] = cpm[_ni * (MWI/VWM) + _mi]; - - } - } -} - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/clblast_level3/xgemm_part3.opencl b/src/clblast_level3/xgemm_part3.opencl deleted file mode 100644 index 54ddaf5db..000000000 --- a/src/clblast_level3/xgemm_part3.opencl +++ /dev/null @@ -1,169 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This is part 3 of 4 of the GEMM kernel. See part 1 for more information. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. 
-R"( - -// ================================================================================================= - -// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions. -INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, - const __global realM* restrict agm, const __global realN* restrict bgm, - __global realM* cgm - #if SA == 1 && SB == 1 - , LOCAL_PTR realM* alm, LOCAL_PTR realN* blm - #elif SA == 1 - , LOCAL_PTR realM* alm - #elif SB == 1 - , LOCAL_PTR realN* blm - #endif - ) { - - // Allocates workitem-private memory (registers) - #pragma promote_to_registers - realM apm[MWI/VWM]; - #pragma promote_to_registers - realN bpm[NWI/VWN]; - #pragma promote_to_registers - realM cpm[NWI*(MWI/VWM)]; - - // Combined thread identifier (volatile to disable caching) - #if SA == 1 || SB == 1 - volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); - #endif - - // Initializes the accumulation registers - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - #pragma unroll - for (int _ni = 0; _ni < NWI; _ni += 1) { - cpm[_ni * (MWI/VWM) + _mi] = InitAccRegisters(); - } - } - - - // Loops over all workgroup tiles - for (int kwg = 0; kwg < kSizeK; kwg += KWG) { - - // Loads data: off-chip --> local (matrix A) - #if SA == 1 - GlobalToLocalA(agm, alm, kSizeM, tid, kwg); - #endif - // Loads data: off-chip --> local (matrix B) - #if SB == 1 - GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); - #endif - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi = 0; pwi < KWG; pwi += KWI) { - #pragma unroll - for (int _pit = 0; _pit < KWI; _pit += 1) { - #if SA == 0 || SB == 0 - int idk = kwg + pwi + _pit; - #endif - #if SA == 1 || SB == 1 - int kg = pwi + _pit; - #endif - - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - // Loads data: local --> private (matrix A) - #if SA == 1 - apm[_mi] = LocalToPrivateA(alm, 
_mi, kg); - // Loads data: off-chip --> private (matrix A) - #else - apm[_mi] = GlobalToPrivateA(agm, _mi, kSizeM, idk, kwg); - #endif - } - - // Loads data: local --> private (matrix B) - #pragma unroll - for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - #if SB == 1 - bpm[_ni] = LocalToPrivateB(blm, _ni, kg); - // Loads data: off-chip --> private (matrix B) - #else - bpm[_ni] = GlobalToPrivateB(bgm, _ni, kSizeN, idk); - #endif - } - - // Performs the accumulation (Cpm += Apm * Bpm) - #pragma unroll - for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - #pragma unroll - for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - const realM aval = apm[_mi]; - #if VWN == 1 - cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni]); - #elif VWN == 2 - cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni].x); - cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi], aval, bpm[_ni].y); - #elif VWN == 4 - cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni].x); - cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi], aval, bpm[_ni].y); - cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi], aval, bpm[_ni].z); - cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi], aval, bpm[_ni].w); - #elif VWN == 8 - cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni].s0); - cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1)*(MWI/VWM) + _mi], aval, bpm[_ni].s1); - cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 2)*(MWI/VWM) + _mi], aval, bpm[_ni].s2); - cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 3)*(MWI/VWM) + _mi], aval, bpm[_ni].s3); - cpm[(_ni*VWN + 4)*(MWI/VWM) + _mi] = 
MultiplyAddVector(cpm[(_ni*VWN + 4)*(MWI/VWM) + _mi], aval, bpm[_ni].s4); - cpm[(_ni*VWN + 5)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 5)*(MWI/VWM) + _mi], aval, bpm[_ni].s5); - cpm[(_ni*VWN + 6)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 6)*(MWI/VWM) + _mi], aval, bpm[_ni].s6); - cpm[(_ni*VWN + 7)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 7)*(MWI/VWM) + _mi], aval, bpm[_ni].s7); - #elif VWN == 16 - cpm[(_ni*VWN + 0 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0 )*(MWI/VWM) + _mi], aval, bpm[_ni].s0); - cpm[(_ni*VWN + 1 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 1 )*(MWI/VWM) + _mi], aval, bpm[_ni].s1); - cpm[(_ni*VWN + 2 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 2 )*(MWI/VWM) + _mi], aval, bpm[_ni].s2); - cpm[(_ni*VWN + 3 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 3 )*(MWI/VWM) + _mi], aval, bpm[_ni].s3); - cpm[(_ni*VWN + 4 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 4 )*(MWI/VWM) + _mi], aval, bpm[_ni].s4); - cpm[(_ni*VWN + 5 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 5 )*(MWI/VWM) + _mi], aval, bpm[_ni].s5); - cpm[(_ni*VWN + 6 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 6 )*(MWI/VWM) + _mi], aval, bpm[_ni].s6); - cpm[(_ni*VWN + 7 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 7 )*(MWI/VWM) + _mi], aval, bpm[_ni].s7); - cpm[(_ni*VWN + 8 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 8 )*(MWI/VWM) + _mi], aval, bpm[_ni].s8); - cpm[(_ni*VWN + 9 )*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 9 )*(MWI/VWM) + _mi], aval, bpm[_ni].s9); - cpm[(_ni*VWN + 10)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 10)*(MWI/VWM) + _mi], aval, bpm[_ni].sA); - cpm[(_ni*VWN + 11)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 11)*(MWI/VWM) + _mi], aval, bpm[_ni].sB); - cpm[(_ni*VWN + 12)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 12)*(MWI/VWM) + _mi], aval, bpm[_ni].sC); - cpm[(_ni*VWN + 13)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 
13)*(MWI/VWM) + _mi], aval, bpm[_ni].sD); - cpm[(_ni*VWN + 14)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 14)*(MWI/VWM) + _mi], aval, bpm[_ni].sE); - cpm[(_ni*VWN + 15)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 15)*(MWI/VWM) + _mi], aval, bpm[_ni].sF); - #endif - } - } - - } - } - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - } - #if GLOBAL_MEM_FENCE == 1 - barrier(CLK_GLOBAL_MEM_FENCE); - #endif - - // Stores an MWG * NWG tile of results - StoreResults(cgm, cpm, kSizeM); -} - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/clblast_level3_half/common.opencl b/src/clblast_level3_half/common.opencl deleted file mode 100644 index 6faf9d2c1..000000000 --- a/src/clblast_level3_half/common.opencl +++ /dev/null @@ -1,170 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common defines and type-defs for the CLBlast OpenCL kernels. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( -// ================================================================================================= - -#define ROUTINE_GEMMBATCHED - -// Parameters set by the tuner or by the database. 
Here they are given a basic default value in case -// this file is used outside of the CLBlast library. -#ifndef PRECISION - #define PRECISION 32 // Data-types: half, single or double precision, complex or regular -#endif - -// ================================================================================================= -#ifndef CUDA - // Enable support for double-precision - #if PRECISION == 16 - #pragma OPENCL EXTENSION cl_khr_fp16: enable - #endif -#endif - -// Half-precision -#if PRECISION == 16 - typedef half real; - typedef half2 real2; - typedef half4 real4; - typedef half8 real8; - typedef half16 real16; - #define ZERO 0 - #define ONE 1 - #define SMALLEST -1.0e14 - -// Single-precision -#elif PRECISION == 32 - typedef float real; - typedef float2 real2; - typedef float4 real4; - typedef float8 real8; - typedef float16 real16; - #define ZERO 0.0f - #define ONE 1.0f - #define SMALLEST -1.0e37f -#endif - -// Single-element version of a complex number - typedef real singlereal; - -// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no -// conversion, but half-precision is not supported as kernel argument so it is converted from float. -#if PRECISION == 16 - typedef float real_arg; - #define GetRealArg(x) (half)x -#else - typedef real real_arg; - #define GetRealArg(x) x -#endif - -// Pointers to local memory objects (using a define because CUDA doesn't need them) -#ifndef LOCAL_PTR - #define LOCAL_PTR __local -#endif - -// ================================================================================================= - -// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific -// devices, this is enabled (see src/routine.cpp). 
-#ifndef USE_CL_MAD - #define USE_CL_MAD 0 -#endif - -// Sets a variable to zero -#define SetToZero(a) a = ZERO - -// Sets a variable to zero (only the imaginary part) -#define ImagToZero(a) - -// Sets a variable to one -#define SetToOne(a) a = ONE - -// Determines whether a variable is zero -#define IsZero(a) (a == ZERO) - -// The absolute value (component-wise) -#define AbsoluteValue(value) value = fabs(value) - -// Negation (component-wise) -#define Negate(value) value = -(value) - -// Adds two complex variables -#define Add(c,a,b) c = a + b - -// Subtracts two complex variables -#define Subtract(c,a,b) c = a - b - -// The scalar multiply function -#define Multiply(c,a,b) c = a * b - -// The scalar multiply-add function -#if USE_CL_MAD == 1 - #define MultiplyAdd(c,a,b) c = mad(a, b, c) -#else - #define MultiplyAdd(c,a,b) c += a * b -#endif - -// The scalar multiply-subtract function -#define MultiplySubtract(c,a,b) c -= a * b - -// The scalar division function: full division -#define DivideFull(c,a,b) c = a / b - -// The scalar AXPBY function -#define AXPBY(e,a,b,c,d) e = a*b + c*d - -// The complex conjugate operation for complex transforms -#define COMPLEX_CONJUGATE(value) - -// ================================================================================================= - -// Force inlining functions or not: some compilers don't support the inline keyword -#ifdef USE_INLINE_KEYWORD - #define INLINE_FUNC inline -#else - #define INLINE_FUNC -#endif - -// ================================================================================================= - -// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is -// enabled (see src/routine.cc). -#ifndef USE_STAGGERED_INDICES - #define USE_STAGGERED_INDICES 0 -#endif - -// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). 
Formula's are taken from: -// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf -// More details: https://github.com/CNugteren/CLBlast/issues/53 -#if USE_STAGGERED_INDICES == 1 - INLINE_FUNC int GetGroupIDFlat() { - return get_group_id(0) + get_num_groups(0) * get_group_id(1); - } - INLINE_FUNC int GetGroupID1() { - return (GetGroupIDFlat()) % get_num_groups(1); - } - INLINE_FUNC int GetGroupID0() { - return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); - } -#else - INLINE_FUNC int GetGroupID1() { return get_group_id(1); } - INLINE_FUNC int GetGroupID0() { return get_group_id(0); } -#endif - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/config.h b/src/config.h index 100e3e998..50c759714 100644 --- a/src/config.h +++ b/src/config.h @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,10 +14,21 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ -#ifndef CONFIG_INCLUDED -#define CONFIG_INCLUDED +#ifndef CONFIG_H_INCLUDED +#define CONFIG_H_INCLUDED /* * We need to check for input while we are thinking. @@ -34,30 +45,39 @@ * BOARD_SIZE: Define size of the board to compile Leela with, must be an odd number due to winograd tiles */ -#define BOARD_SIZE 19 -#define BOARD_SQUARES (BOARD_SIZE*BOARD_SIZE) +static constexpr auto BOARD_SIZE = 19; +static_assert(BOARD_SIZE % 2 == 1, + "Code assumes odd board size, remove at your own risk!"); -#if (BOARD_SIZE % 2 == 0) -#error Code assumes odd board size, remove at your own risk! -#endif +static constexpr auto NUM_INTERSECTIONS = BOARD_SIZE * BOARD_SIZE; +static constexpr auto POTENTIAL_MOVES = NUM_INTERSECTIONS + 1; // including pass + +/* + * KOMI: Define the default komi to use when training. + */ +static constexpr auto KOMI = 7.5f; /* * Features * - * USE_BLAS: Use a basic linear algebra library. - * We currently require this, as not all operations are performed on - * the GPU - some operations won't get any speedup from it. + * USE_BLAS: Optionally use a basic linear algebra library. + * This is may perform faster than the included Eigen library, + * and some BLAS libraries can target multiple CPU models. + * Not all operations are performed on the GPU - + * some operations won't get any speedup from it. * Also used for OpenCL self-checks. */ -#define USE_BLAS +//#define USE_BLAS /* * We use OpenBLAS by default, except on macOS, which has a fast BLAS * built-in. (Accelerate) */ #if !defined(__APPLE__) && !defined(__MACOSX) +#if defined(USE_BLAS) #define USE_OPENBLAS #endif +#endif /* * USE_MKL: Optionally allows using Intel Math Kernel library as @@ -73,42 +93,51 @@ */ #ifndef USE_CPU_ONLY #define USE_OPENCL + +/* + * USE_HALF: Include the half-precision OpenCL implementation when building. 
+ * The current implementation autodetects whether half-precision is better + * or single-precision is better (half precision is chosen if it's 5% faster) + * Half-precision OpenCL gains performance on some GPUs while losing some + * accuracy on the calculation, but generally it is worth using half precision + * if it is at least 5% faster. + */ +#define USE_HALF + #endif + /* * USE_TUNER: Expose some extra command line parameters that allow tuning the * search algorithm. */ //#define USE_TUNER -#define PROGRAM_NAME "Leela Zero" -#define PROGRAM_VERSION "0.15" +static constexpr auto PROGRAM_NAME = "Leela Zero"; +static constexpr auto PROGRAM_VERSION = "0.17"; /* * OpenBLAS limitation: the default configuration on some Linuxes * is limited to 64 cores. */ #if defined(USE_BLAS) && defined(USE_OPENBLAS) -#define MAX_CPUS 64 +static constexpr auto MAX_CPUS = 64; #else -#define MAX_CPUS 128 +static constexpr auto MAX_CPUS = 256; #endif #ifdef USE_HALF #include "half/half.hpp" -using net_t = half_float::half; -#else -using net_t = float; #endif -#if defined(USE_BLAS) && defined(USE_OPENCL) && !defined(USE_HALF) -// If both BLAS and OpenCL are fully usable, then check the OpenCL -// results against BLAS with some probability. +#ifdef USE_OPENCL +// If OpenCL are fully usable, then check the OpenCL against CPU +// implementation with some probability. #define USE_OPENCL_SELFCHECK -#define SELFCHECK_PROBABILITY 2000 +static constexpr auto SELFCHECK_PROBABILITY = 2000; #endif #if (_MSC_VER >= 1400) /* VC8+ Disable all deprecation warnings */ - #pragma warning(disable : 4996) +#pragma warning(disable : 4996) #endif /* VC8+ */ #endif diff --git a/src/kernels/clblast/hgemm_tensorcore.opencl b/src/kernels/clblast/hgemm_tensorcore.opencl new file mode 100644 index 000000000..fd4ab9fd3 --- /dev/null +++ b/src/kernels/clblast/hgemm_tensorcore.opencl @@ -0,0 +1,331 @@ +/* + This file is part of Leela Zero. 
+ Copyright (C) 2017-2018 Junhee Yoo and contributors + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . +*/ + + +// This is the tensor core implementation of XgemmBatched. Can only be used on +// GPUs with NVIDIA's Volta / Turing architectures with wmmv instructions. + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. + +R"( +#ifndef SA +#define SA 1 +#endif + +#ifndef SB +#define SB 1 +#endif + +#ifndef VWM +#define VWM 4 +#endif + +#ifndef VWN +#define VWN 2 +#endif + +#if VWM == 1 +#define vstoreM vstore +#define vloadM vload +#elif VWM == 2 +#define vstoreM vstore2 +#define vloadM vload2 +#elif VWM == 4 +#define vstoreM vstore4 +#define vloadM vload4 +#elif VWM == 8 +#define vstoreM vstore8 +#define vloadM vload8 +#elif VWM == 16 +#define vstoreM vstore16 +#define vloadM vload16 +#endif + +#if VWN == 1 +#define vstoreN vstore +#define vloadN vload +#elif VWN == 2 +#define vstoreN vstore2 +#define vloadN vload2 +#elif VWN == 4 +#define vstoreN vstore4 +#define vloadN vload4 +#elif VWN == 8 +#define vstoreN vstore8 +#define vloadN vload8 +#elif VWN == 16 +#define vstoreN vstore16 +#define vloadN vload16 +#endif + +#define WARP_SIZE 32 + +#if MDIMA == 32 && NDIMB == 8 +#define WMMA_SHAPE "m32n8k16" +#elif MDIMA == 16 && NDIMB == 16 +#define WMMA_SHAPE "m16n16k16" +#elif MDIMA == 8 && NDIMB == 32 
+#define WMMA_SHAPE "m8n32k16" +#else +#error Unsupported MDIMA / NDIMB combination +#endif + + +void GlobalToLocalA(int tid, int stride, __local short * alm, __global short * agm) +{ + const int copy_size = KWG * MWG; + const int dest_stride = MWG; + const int num_threads = MDIMC * NDIMC * WARP_SIZE / (MDIMA * NDIMB); + +#pragma unroll + for(int i=tid * VWM; i < copy_size; i += num_threads * VWM) { + int x = i % dest_stride; + int y = i / dest_stride; + + vstoreM( vloadM((y * stride + x) / VWM, agm), i / VWM, alm); + } +} + + +void GlobalToLocalB(int tid, int stride, __local short * blm, __global short * bgm) +{ + const int copy_size = KWG * NWG; + const int dest_stride = NWG; + const int num_threads = MDIMC * NDIMC * WARP_SIZE / (MDIMA * NDIMB); +#pragma unroll + for(int i=tid * VWN; i < copy_size; i += num_threads * VWN) { + int x = i % dest_stride; + int y = i / dest_stride; + vstoreN( vloadN((y * stride + x) / VWN, bgm), i / VWN, blm); + } +} + + +void HgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, + #if SA == 1 + __local short* alm, + #endif + #if SB == 1 + __local short* blm, + #endif + const __global half* restrict agm, + const __global half* restrict bgm, + __global half* restrict cgm) +{ + int laneid; + asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); + + // the base location of the MDIMA * NDIMB tile number this thread is responsible of + int tile_m = get_global_id(0) / WARP_SIZE * MWG / MDIMC; + int tile_n = get_global_id(1) * NWG / NDIMC; + + // the base pointers of agm, bgm and cgm + const __global half * agm_ = agm + MDIMA * tile_m; + const __global half * bgm_ = bgm + NDIMB * tile_n; + __global half * cgm_ = cgm + kSizeM * NDIMB * tile_n + MDIMA * tile_m; + + // the (m,n) position within the warp + int offset_number = laneid; + int offset_m = offset_number % (MDIMA/2); + int offset_n = offset_number / (MDIMA/2); + + if(laneid != get_global_id(0) % WARP_SIZE) { + // this is just to make sure we crash ourselves if the basic assumption 
doesn't hold + return; + } + + int k, m, n, mb, nb, kb, kwg; + int zero_pair; + asm("{\n" + ".reg .b16 xh;\n" + ".reg .b32 x;\n" + "mov.f32 x, 0.0;\n" + "cvt.rz.f16.f32 xh, x;\n" + "mov.b32 %0, {xh,xh};\n" + "}": "=r"(zero_pair) + ); + +#pragma promote_to_registers + int c0[MWG/MDIMC][NWG/NDIMC]; +#pragma promote_to_registers + int c1[MWG/MDIMC][NWG/NDIMC]; +#pragma promote_to_registers + int c2[MWG/MDIMC][NWG/NDIMC]; +#pragma promote_to_registers + int c3[MWG/MDIMC][NWG/NDIMC]; + #pragma unroll + for(mb = 0; mb < MWG / MDIMC; mb += 1) { + #pragma unroll + for(nb = 0; nb < NWG / NDIMC; nb += 1) { + c0[mb][nb] = zero_pair; + c1[mb][nb] = zero_pair; + c2[mb][nb] = zero_pair; + c3[mb][nb] = zero_pair; + } + } + for(kwg = 0; kwg < kSizeK; kwg += KWG) { +#if SA == 1 + GlobalToLocalA(get_local_id(0) + get_local_id(1) * WARP_SIZE * MDIMC / MDIMA, kSizeM, + alm, + (__global short *)(agm + get_group_id(0) * MWG + kwg * kSizeM) + ); +#endif + +#if SB == 1 + GlobalToLocalB(get_local_id(0) + get_local_id(1) * WARP_SIZE * MDIMC / MDIMA, kSizeN, + blm, + (__global short *)(bgm + get_group_id(1) * NWG + kwg * kSizeN) + ); + +#endif + +#if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#pragma unroll + for(kb = 0; kb < KWG; kb += 16) { +#pragma promote_to_registers + int b[NWG/NDIMC][8]; + for(nb = 0; nb < NWG / NDIMC; nb += 1) { +#if SB == 1 + const int block_loc_n = (get_local_id(1)) % (NDIMC/NDIMB); + const int bgm_stride = NWG; + const __local half * b_bgm_ = (const __local half *)(blm + (nb + block_loc_n * (NWG/NDIMC)) * NDIMB); + const __local half * bb_bgm_ = b_bgm_ + bgm_stride * kb; +#else + const int bgm_stride = kSizeN; + const __global half * b_bgm_ = bgm_ + nb * NDIMB; + const __global half * bb_bgm_ = b_bgm_ + kSizeN * (kb + kwg); +#endif + asm("{\n" +#if SB == 1 + "wmma.load.b.sync.aligned." WMMA_SHAPE ".shared.row.f16 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8], %9;\n" +#else + "wmma.load.b.sync.aligned." 
WMMA_SHAPE ".row.f16 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8], %9;\n" +#endif + "}": "=r"(b[nb][0]), "=r"(b[nb][1]), "=r"(b[nb][2]), "=r"(b[nb][3]), "=r"(b[nb][4]), "=r"(b[nb][5]), "=r"(b[nb][6]), "=r"(b[nb][7]) : "l"(bb_bgm_), "r"(bgm_stride)); + } +#pragma unroll + for(mb = 0; mb < MWG / MDIMC; mb += 1) { +#pragma promote_to_registers + int a[8]; +#if SA == 1 + const int block_loc_m = (get_local_id(0)/WARP_SIZE) % (MDIMC/MDIMA); + const int agm_stride = MWG; + const __local half * b_agm_ = (const __local half *)(alm + (mb + block_loc_m * (MWG/MDIMC)) * MDIMA); + const __local half * bb_agm_ = b_agm_ + agm_stride * kb; +#else + const int agm_stride = kSizeM; + const __global half * b_agm_ = agm_ + mb * MDIMA; + const __global half * bb_agm_ = b_agm_ + kSizeM * (kb + kwg); +#endif + asm("{\n" +#if SA == 1 + "wmma.load.a.sync.aligned." WMMA_SHAPE ".shared.col.f16 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8], %9;\n" +#else + "wmma.load.a.sync.aligned." WMMA_SHAPE ".col.f16 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8], %9;\n" +#endif + "}": "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]), "=r"(a[4]), "=r"(a[5]), "=r"(a[6]), "=r"(a[7]) : "l"(bb_agm_), "r"(agm_stride)); + +#pragma unroll + for(nb = 0; nb < NWG / NDIMC; nb += 1) { + int d0_, d1_, d2_, d3_; + int c0_ = c0[mb][nb]; + int c1_ = c1[mb][nb]; + int c2_ = c2[mb][nb]; + int c3_ = c3[mb][nb]; + asm("{\n" + "wmma.mma.sync.aligned.col.row." 
WMMA_SHAPE ".f16.f16 " + " {%0,%1,%2,%3},\n" + " {%8,%9,%10,%11,%12,%13,%14,%15},\n" + " {%16,%17,%18,%19,%20,%21,%22,%23},\n" + " {%4,%5,%6,%7};\n" + "}": "=r"(d0_), "=r"(d1_), "=r"(d2_), "=r"(d3_) : "r"(c0_), "r"(c1_), "r"(c2_), "r"(c3_), + "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]), + "r"(b[nb][0]), "r"(b[nb][1]), "r"(b[nb][2]), "r"(b[nb][3]), "r"(b[nb][4]), "r"(b[nb][5]), "r"(b[nb][6]), "r"(b[nb][7]) + ); + c0[mb][nb] = d0_; + c1[mb][nb] = d1_; + c2[mb][nb] = d2_; + c3[mb][nb] = d3_; + } + } + } + } + +#pragma unroll + for(mb = 0; mb < MWG / MDIMC; mb += 1) { +#pragma unroll + for(nb = 0; nb < NWG / NDIMC; nb += 1) { + int c0_ = c0[mb][nb]; + int c1_ = c1[mb][nb]; + int c2_ = c2[mb][nb]; + int c3_ = c3[mb][nb]; + __global half * b_cgm_ = cgm_ + kSizeM * nb * NDIMB + mb * MDIMA; + asm("{\n" + "wmma.store.d.sync.aligned.col." WMMA_SHAPE ".f16 [%4], {%0,%1,%2,%3}, %5;" + "}" : : "r"(c0_), "r"(c1_), "r"(c2_), "r"(c3_), "l"(b_cgm_), "r"(kSizeM)); + } + } +} + +struct alm_t {short alm[KWG * MWG];} __attribute__((aligned(32))); +struct blm_t {short blm[KWG * NWG];} __attribute__((aligned(32))); + +__kernel __attribute__((reqd_work_group_size(32*MDIMC/MDIMA, NDIMC/NDIMB, 1))) +void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, + const __global half* restrict agm, + const __global half* restrict bgm, + __global half* restrict cgm) +{ + // Sets the offsets + const int batch = get_group_id(2); + const int a_offset = kSizeM*kSizeK*batch; + const int b_offset = kSizeK*kSizeN*batch; + const int c_offset = kSizeM*kSizeN*batch; + + const __global half* restrict agm_ = &agm[a_offset]; + const __global half* restrict bgm_ = &bgm[b_offset]; + __global half* restrict cgm_ = &cgm[c_offset]; + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local struct alm_t alm; + #endif + #if SB == 1 + __local struct blm_t blm; + #endif + + #if SA == 1 && SB == 1 + HgemmBody(kSizeM, kSizeN, kSizeK, alm.alm, 
blm.blm, agm_, bgm_, cgm_); + #elif SA == 1 + HgemmBody(kSizeM, kSizeN, kSizeK, alm.alm, agm_, bgm_, cgm_); + #elif SB == 1 + HgemmBody(kSizeM, kSizeN, kSizeK, blm.blm, agm_, bgm_, cgm_); + #else + HgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, cgm_); + #endif +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" +// ================================================================================================= diff --git a/src/clblast_level3_half/xgemm_batched.opencl b/src/kernels/clblast/xgemm_batched.opencl similarity index 100% rename from src/clblast_level3_half/xgemm_batched.opencl rename to src/kernels/clblast/xgemm_batched.opencl diff --git a/src/clblast_level3_half/xgemm_part1.opencl b/src/kernels/clblast/xgemm_part1.opencl similarity index 80% rename from src/clblast_level3_half/xgemm_part1.opencl rename to src/kernels/clblast/xgemm_part1.opencl index 289efa3a4..7922761af 100644 --- a/src/clblast_level3_half/xgemm_part1.opencl +++ b/src/kernels/clblast/xgemm_part1.opencl @@ -107,39 +107,77 @@ R"( // ================================================================================================= // Data-widths in dimension M -#if VWM == 1 - typedef real realM; - typedef short memM; -#elif VWM == 2 - typedef real2 realM; - typedef short2 memM; -#elif VWM == 4 - typedef real4 realM; - typedef short4 memM; -#elif VWM == 8 - typedef real8 realM; - typedef short8 memM; -#elif VWM == 16 - typedef real16 realM; - typedef short16 memM; +#ifdef FP16_STORAGE + #if VWM == 1 + typedef real realM; + typedef short memM; + #elif VWM == 2 + typedef real2 realM; + typedef short2 memM; + #elif VWM == 4 + typedef real4 realM; + typedef short4 memM; + #elif VWM == 8 + typedef real8 realM; + typedef short8 memM; + #elif VWM == 16 + typedef real16 realM; + typedef short16 memM; + #endif +#else + #if VWM == 1 + typedef real realM; + typedef real memM; + #elif VWM == 2 + typedef 
real2 realM; + typedef real2 memM; + #elif VWM == 4 + typedef real4 realM; + typedef real4 memM; + #elif VWM == 8 + typedef real8 realM; + typedef real8 memM; + #elif VWM == 16 + typedef real16 realM; + typedef real16 memM; + #endif #endif // Data-widths in dimension N -#if VWN == 1 - typedef real realN; - typedef short memN; -#elif VWN == 2 - typedef real2 realN; - typedef short2 memN; -#elif VWN == 4 - typedef real4 realN; - typedef short4 memN; -#elif VWN == 8 - typedef real8 realN; - typedef short8 memN; -#elif VWN == 16 - typedef real16 realN; - typedef short16 memN; +#ifdef FP16_STORAGE + #if VWN == 1 + typedef real realN; + typedef short memN; + #elif VWN == 2 + typedef real2 realN; + typedef short2 memN; + #elif VWN == 4 + typedef real4 realN; + typedef short4 memN; + #elif VWN == 8 + typedef real8 realN; + typedef short8 memN; + #elif VWN == 16 + typedef real16 realN; + typedef short16 memN; + #endif +#else + #if VWN == 1 + typedef real realN; + typedef real memN; + #elif VWN == 2 + typedef real2 realN; + typedef real2 memN; + #elif VWN == 4 + typedef real4 realN; + typedef real4 memN; + #elif VWN == 8 + typedef real8 realN; + typedef real8 memN; + #elif VWN == 16 + typedef real16 realN; + typedef real16 memN; + #endif #endif // ================================================================================================= @@ -268,16 +306,20 @@ INLINE_FUNC realM GlobalToPrivateA(const __global memM* restrict agm, const int int idm = mg + GetGroupID0() * (MWG/VWM); // Loads the data from global memory (not transposed) and stores into registers -#if VWM == 1 - return vloada_half(idk*(kSizeM/VWM) + idm, (const __global half*)agm); -#elif VWM == 2 - return vloada_half2(idk*(kSizeM/VWM) + idm, (const __global half*)agm); -#elif VWM == 4 - return vloada_half4(idk*(kSizeM/VWM) + idm, (const __global half*)agm); -#elif VWM == 8 - return vloada_half8(idk*(kSizeM/VWM) + idm, (const __global half*)agm); -#elif VWM == 16 - return vloada_half16(idk*(kSizeM/VWM) + 
idm, (const __global half*)agm); +#ifdef FP16_STORAGE + #if VWM == 1 + return vloada_half(idk*(kSizeM/VWM) + idm, (const __global half*)agm); + #elif VWM == 2 + return vloada_half2(idk*(kSizeM/VWM) + idm, (const __global half*)agm); + #elif VWM == 4 + return vloada_half4(idk*(kSizeM/VWM) + idm, (const __global half*)agm); + #elif VWM == 8 + return vloada_half8(idk*(kSizeM/VWM) + idm, (const __global half*)agm); + #elif VWM == 16 + return vloada_half16(idk*(kSizeM/VWM) + idm, (const __global half*)agm); + #endif +#else + return agm[idk*(kSizeM/VWM) + idm]; #endif } #endif @@ -297,16 +339,20 @@ INLINE_FUNC realN GlobalToPrivateB(const __global memN* restrict bgm, const int int idn = ng + GetGroupID1() * (NWG/VWN); // Loads the data from global memory (transposed) and stores into registers -#if VWN == 1 - return vloada_half(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); -#elif VWN == 2 - return vloada_half2(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); -#elif VWN == 4 - return vloada_half4(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); -#elif VWN == 8 - return vloada_half8(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); -#elif VWN == 16 - return vloada_half16(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); +#ifdef FP16_STORAGE + #if VWN == 1 + return vloada_half(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); + #elif VWN == 2 + return vloada_half2(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); + #elif VWN == 4 + return vloada_half4(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); + #elif VWN == 8 + return vloada_half8(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); + #elif VWN == 16 + return vloada_half16(idk*(kSizeN/VWN) + idn, (const __global half*)bgm); + #endif +#else + return bgm[idk*(kSizeN/VWN) + idn]; #endif } #endif @@ -322,6 +368,7 @@ INLINE_FUNC realM LocalToPrivateA(LOCAL_PTR memM* alm, const int _mi, const int #elif STRM == 1 int mg = get_local_id(0) + _mi*MDIMC; #endif +#ifdef FP16_STORAGE #if VWM == 1 return 
vloada_half(kg*(MWG/VWM) + mg, (LOCAL_PTR half*)alm); #elif VWM == 2 @@ -333,6 +380,9 @@ INLINE_FUNC realM LocalToPrivateA(LOCAL_PTR memM* alm, const int _mi, const int #elif VWM == 16 return vloada_half16(kg*(MWG/VWM) + mg, (LOCAL_PTR half*)alm); #endif +#else + return alm[kg*(MWG/VWM) + mg]; +#endif } #endif @@ -345,6 +395,7 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR memN* blm, const int _ni, const int int ng = get_local_id(1) + _ni*NDIMC; #endif +#ifdef FP16_STORAGE #if VWN == 1 return vloada_half(kg*(NWG/VWN) + ng, (LOCAL_PTR half*)blm); #elif VWN == 2 @@ -356,6 +407,9 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR memN* blm, const int _ni, const int #elif VWN == 16 return vloada_half16(kg*(NWG/VWN) + ng, (LOCAL_PTR half*)blm); #endif +#else + return blm[kg*(NWG/VWN) + ng]; +#endif } #endif diff --git a/src/clblast_level3_half/xgemm_part2.opencl b/src/kernels/clblast/xgemm_part2.opencl similarity index 98% rename from src/clblast_level3_half/xgemm_part2.opencl rename to src/kernels/clblast/xgemm_part2.opencl index 44ea588cf..b9ff537f2 100644 --- a/src/clblast_level3_half/xgemm_part2.opencl +++ b/src/kernels/clblast/xgemm_part2.opencl @@ -85,6 +85,7 @@ INLINE_FUNC void StoreResults(__global memM* cgm, realM cpm[NWI*MWI/VWM], const int idn = ng + GetGroupID1() * NWG; int index = idn*(kSizeM/VWM) + idm; +#ifdef FP16_STORAGE #if VWM == 1 vstorea_half(cpm[_ni * (MWI/VWM) + _mi], index, (__global half*)cgm); #elif VWM == 2 @@ -96,7 +97,9 @@ INLINE_FUNC void StoreResults(__global memM* cgm, realM cpm[NWI*MWI/VWM], const #elif VWM == 16 vstorea_half16(cpm[_ni * (MWI/VWM) + _mi], index, (__global half*)cgm); #endif - +#else + cgm[index] = cpm[_ni * (MWI/VWM) + _mi]; +#endif } } } diff --git a/src/clblast_level3_half/xgemm_part3.opencl b/src/kernels/clblast/xgemm_part3.opencl similarity index 100% rename from src/clblast_level3_half/xgemm_part3.opencl rename to src/kernels/clblast/xgemm_part3.opencl diff --git a/src/clblast_level3/common.opencl 
b/src/kernels/common.opencl similarity index 87% rename from src/clblast_level3/common.opencl rename to src/kernels/common.opencl index 6faf9d2c1..a220cb79f 100644 --- a/src/clblast_level3/common.opencl +++ b/src/kernels/common.opencl @@ -18,10 +18,34 @@ R"( #define ROUTINE_GEMMBATCHED -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this file is used outside of the CLBlast library. +#ifdef USE_HALF + #ifdef FP16_SUPPORT + #define FP16_COMPUTE + #else + #define FP16_STORAGE + #endif +#endif + #ifndef PRECISION - #define PRECISION 32 // Data-types: half, single or double precision, complex or regular + #ifdef FP16_COMPUTE + #define PRECISION 16 + #else + #define PRECISION 32 // Data-types: half, single or double precision, complex or regular + #endif +#endif + +#ifdef FP16_STORAGE + typedef half net_t; + #define vload_net_t(offset,p) vload_half(offset,p) + #define vstore_net_t(data,offset,p) vstore_half(data,offset,p) +#else + #ifdef FP16_COMPUTE + typedef half net_t; + #else + typedef float net_t; + #endif + #define vload_net_t(offset,p) ((p)[(offset)]) + #define vstore_net_t(data,offset,p) (((p)[(offset)])=(data)) #endif // ================================================================================================= @@ -39,6 +63,7 @@ R"( typedef half4 real4; typedef half8 real8; typedef half16 real16; + #define SQ2 1.4142135623730951 #define ZERO 0 #define ONE 1 #define SMALLEST -1.0e14 @@ -50,6 +75,7 @@ R"( typedef float4 real4; typedef float8 real8; typedef float16 real16; + #define SQ2 1.4142135623730951f #define ZERO 0.0f #define ONE 1.0f #define SMALLEST -1.0e37f diff --git a/src/kernels/convolve1.opencl b/src/kernels/convolve1.opencl new file mode 100644 index 000000000..7f25eb8ad --- /dev/null +++ b/src/kernels/convolve1.opencl @@ -0,0 +1,138 @@ +/* + This file is part of Leela Zero. 
+ Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. +*/ + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. 
+R"( + __kernel + __attribute__((work_group_size_hint(8, 16, 1))) + void convolve1( + __global const net_t * restrict in, + __global net_t * restrict merge, + __global const net_t * restrict weights, + __local real * channel_buff, + __local real * row_buff) { + + // cl::NDRange global(channels, outputs, row); + const int c = get_global_id(0); // channel + const int o = get_global_id(1); // output + const int row_batch = get_global_id(2); // row * batch_size + + const int row = row_batch % BOARD_SIZE; + const int batch = row_batch / BOARD_SIZE; + + const int channels = get_global_size(0); + const int outputs = get_global_size(1); + + const int input_offset = batch * NUM_INTERSECTIONS * channels; + const int merge_offset = batch * NUM_INTERSECTIONS * (channels >> 3) * outputs; + + // cl::NDRange local(2, (1->32), 1); + const int lx = get_local_id(0); + const int ly = get_local_id(1); + const int chan_buff_size = 8; + const int out_buff_size = get_local_size(1); + const int row_buff_size = 7; + const int chan_shift = 3; + // input = channels * height * width + // output = outputs * height * width + // weights = output * channels * filter + // merge = channels * outputs * height * width + const int width = BOARD_SIZE; + const int height = BOARD_SIZE; + const int strip_size = width; + // Copy the input channels (strips) locally + if (out_buff_size < BOARD_SIZE && ly == 0) { + // strip-row + for (int w = 0; w < width; w++) { + channel_buff[lx * width + w] = + vload_net_t((c * height + row) * width + w + input_offset, in); + } + } else if (out_buff_size >= BOARD_SIZE && ly < BOARD_SIZE) { + // Every thread copies a column + channel_buff[lx * width + ly] = vload_net_t((c * height + row) * width + + ly + input_offset, in); + } + // Copy the filter we are applying locally + __private real filter_buff = vload_net_t((o * channels + c), weights); + barrier(CLK_LOCAL_MEM_FENCE); + int out_lane = 0; + int out_cw = 0; + #pragma unroll + for (int cw = 0; cw < width; cw++) { + int 
fid = lx * strip_size; + real out = channel_buff[fid + cw] * filter_buff; + row_buff[(ly * chan_buff_size + lx) * row_buff_size + out_lane] = out; + out_lane++; + // Row buffer full or last lane? + if (out_lane == row_buff_size || (cw == width - 1)) { + barrier(CLK_LOCAL_MEM_FENCE); + if (lx < out_lane) { + real val; + val = row_buff[(ly * chan_buff_size + 0) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 1) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 2) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 3) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 4) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 5) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 6) * row_buff_size + lx]; + val += row_buff[(ly * chan_buff_size + 7) * row_buff_size + lx]; + vstore_net_t(val, (((c >> chan_shift) * height + row) * width + + out_cw + lx) * outputs + o + merge_offset, merge); + } + out_cw += row_buff_size; + out_lane = 0; + } + } + } + +__kernel void merge( + __global const net_t * restrict in, + __global net_t * restrict out, + __private const int channels) { + // cl::NDRange global(outputs, NUM_INTERSECTIONS); + const int gx = get_global_id(0); + const int gy = get_global_id(1); + const int batch = get_global_id(2); + const int output = gx; + const int b = gy; + const int outputs = get_global_size(0); + const int width = BOARD_SIZE; + const int height = BOARD_SIZE; + const int o = output; + real sum = 0; + for (int c = 0; c < channels; c++) { + sum += vload_net_t(batch * channels * NUM_INTERSECTIONS * outputs + + (c * NUM_INTERSECTIONS + b) * outputs + o, in); + } + vstore_net_t(sum, batch * outputs * NUM_INTERSECTIONS + o * NUM_INTERSECTIONS + b, out); + } + +// End of the C++11 raw string literal +)" diff --git a/src/kernels/convolve3.opencl b/src/kernels/convolve3.opencl new file mode 100644 index 000000000..c422f55a5 --- /dev/null +++ b/src/kernels/convolve3.opencl @@ -0,0 
+1,455 @@ +/* + This file is part of Leela Zero. + Copyright (C) 2017-2019 Gian-Carlo Pascutto and contributors + + Leela Zero is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Zero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. +*/ + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. 
+ +R"( + +#ifndef OUTIN_KWG +#define OUTIN_KWG 2 +#endif + +#ifndef OUT_KWG +#define OUT_KWG 32 +#endif + +#ifndef OUT_BWG +#define OUT_BWG 2 +#endif + +__constant real Bt[WINOGRAD_ALPHA * WINOGRAD_ALPHA] = \ + {1.0f, 0.0f, -5.0f/2.0f, 0.0f, 1.0f, 0.0f, + 0.0f, -SQ2, -2.0f, SQ2/2.0f, 1.0f, 0.0f, + 0.0f, SQ2, -2.0f, -SQ2/2.0f, 1.0f, 0.0f, + 0.0f, -SQ2/2.0f, -1.0f/2.0f, SQ2, 1.0f, 0.0f, + 0.0f, SQ2/2.0f, -1.0f/2.0f, -SQ2, 1.0f, 0.0f, + 0.0f, 1.0f, 0.0f, -5.0f/2.0f, 0.0f, 1.0f}; +void multiply_bt( + real * o0, real * o1, real * o2, real * o3, real * o4, real * o5, + real i0, real i1, real i2, real i3, real i4, real i5 +) { + real i3m1 = i1 * -SQ2 + i3 * (SQ2 / 2.0f); + real i4m2 = i2 * -2.0f + i4 * 1.0f; + + *o0 = i0 + i2 * (-5.0f/2.0f) + i4; + *o1 = i3m1 + i4m2; + *o2 = -i3m1 + i4m2; + + real i3m1_2 = i3 * (SQ2) + i1 * (-SQ2/2.0f); + real i4m2_2 = i2 * (-1.0f/2.0f) + i4; + + *o3 = i3m1_2 + i4m2_2; + *o4 = -i3m1_2 + i4m2_2; + + *o5 = i1 + i3 * (-5.0f/2.0f) + i5; +} + + +__constant real At[WINOGRAD_M * WINOGRAD_ALPHA] = \ + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, + 0.0f, SQ2/2.0f, -SQ2/2.0f, SQ2, -SQ2, 0.0f, + 0.0f, 1.0f/2.0f, 1.0f/2.0f, 2.0f, 2.0f, 0.0f, + 0.0f, SQ2/4.0f, -SQ2/4.0f, 2.0f*SQ2, -2.0f*SQ2, 1.0f}; +void multiply_atv( + real4 * o, + real i0, real i1, real i2, real i3, real i4, real i5 +) { + real t1p2 = (i1 + i2) * (1.0f / 2.0f); + real t1m2 = (i1 - i2) * (SQ2/4.0f); + real t3p4 = i3 + i4; + real t3m4 = (i3 - i4) * (SQ2); + + (*o).x = i0 + t1p2 + t1p2 + t3p4; + (*o).y = t1m2 + t1m2 + t3m4; + (*o).z = t1p2 + t3p4 + t3p4; + (*o).w = t1m2 + t3m4 + t3m4 + i5; +} + + +void multiply_at( + real * o0, real * o1, real * o2, real * o3, + real i0, real i1, real i2, real i3, real i4, real i5 +) { + real4 o; + multiply_atv(&o, i0, i1, i2, i3, i4, i5); + + *o0 = o.x; + *o1 = o.y; + *o2 = o.z; + *o3 = o.w; +} + +void __in_transform_eq(real x[WINOGRAD_ALPHA][WINOGRAD_ALPHA], __global net_t * restrict V, int offset, int CPpad) { + + const int W = BOARD_SIZE; + const int H = 
BOARD_SIZE; + const int P = WTILES * WTILES; + + real T1[WINOGRAD_ALPHA][WINOGRAD_ALPHA]; + real T2[WINOGRAD_ALPHA][WINOGRAD_ALPHA]; + + // Calculates transpose(B).x.B +#ifdef WINOGRAD_SIMD + for (int i = 0; i < WINOGRAD_ALPHA; i++){ + for (int j = 0; j < WINOGRAD_ALPHA; j++) { + real2 acc = {ZERO, ZERO}; + real2 *x2 = (real2 *)&x[j][0]; + for (int k = 0; k < WINOGRAD_ALPHA/2; k++) { + real2 x1; + x1.x = Bt[i * WINOGRAD_ALPHA + 2*k]; + x1.y = Bt[i * WINOGRAD_ALPHA + 2*k + 1]; + acc += x1 * x2[k]; + } + T1[i][j] = acc.x + acc.y; + } + } +#else + for (int j = 0; j < WINOGRAD_ALPHA; j++) { + multiply_bt( + &(T1[0][j]), &(T1[1][j]), &(T1[2][j]), &(T1[3][j]), &(T1[4][j]), &(T1[5][j]), + x[j][0], x[j][1], x[j][2], x[j][3], x[j][4], x[j][5] + ); + } +#endif + +#ifdef WINOGRAD_SIMD + for (int i = 0; i < WINOGRAD_ALPHA; i++){ + for (int j = 0; j < WINOGRAD_ALPHA; j++) { + real2 acc = {ZERO, ZERO}; + real2 *x1 = (real2 *)&T1[i][0]; + for (int k = 0; k < WINOGRAD_ALPHA/2; k++) { + real2 x2; + x2.x = Bt[j * WINOGRAD_ALPHA + 2*k]; + x2.y = Bt[j * WINOGRAD_ALPHA + 2*k + 1]; + acc += x1[k] * x2; + } + T2[i][j] = acc.x + acc.y; + } + } +#else + for (int i = 0; i < WINOGRAD_ALPHA; i++){ + multiply_bt( + &(T2[i][0]), &(T2[i][1]), &(T2[i][2]), &(T2[i][3]), &(T2[i][4]), &(T2[i][5]), + T1[i][0], T1[i][1], T1[i][2], T1[i][3], T1[i][4], T1[i][5] + ); + } +#endif + + // Scatter each sub element in tile to separate matrices + for (int i = 0; i < WINOGRAD_ALPHA; i++) { + for (int j = 0; j < WINOGRAD_ALPHA; j++) { + vstore_net_t(T2[i][j], (i*WINOGRAD_ALPHA + j)*CPpad + offset, V); + } + } +} + +__kernel void in_transform(__global net_t * restrict in, __global net_t * restrict V, + const int C, const int Cpad, + const int Ppad, const int batch_size) { + const int W = BOARD_SIZE; + const int H = BOARD_SIZE; + const int P = WTILES * WTILES; + const int CPpad = Ppad * Cpad; + + const int block = get_global_id(0); + const int ch = get_global_id(1); + + const int batch = block / P; + const int 
block_x = (block - P * batch) % WTILES; + const int block_y = (block - P * batch) / WTILES; + + // 6x6 tiles overlap by 2 + const int yin = WINOGRAD_M * block_y - 1; + const int xin = WINOGRAD_M * block_x - 1; + + if (block < batch_size * P && ch < C) { + // Cache input tile and handle zero padding + real x[WINOGRAD_ALPHA][WINOGRAD_ALPHA]; + for (int i = 0; i < WINOGRAD_ALPHA; i++) { + for (int j = 0; j < WINOGRAD_ALPHA; j++) { + int a = xin + j; + int b = yin + i; + // x is transposed here for better layout later + if (b >= 0 && a >= 0 && b < H && a < W) { + x[j][i] = vload_net_t(batch * C * NUM_INTERSECTIONS + + ch * NUM_INTERSECTIONS + b * W + a, in); + } else { + x[j][i] = ZERO; + } + } + } + + // V dimensions are [36, input_channels, batch_size * tiles]. + // Padded with zeros as necessary for SGEMM + // = [36, Cpad, Ppad] + + const int offset = ch * Ppad + block; + __in_transform_eq(x, V, offset, CPpad); + } +} + +__kernel __attribute__((reqd_work_group_size(OUT_KWG, OUT_BWG, 1))) +void out_transform_fused_bn(__global const net_t * restrict M, + __global net_t * restrict Y, + const int K, + const int Kpad, const int Ppad, + const int batch_size, + __global const net_t * restrict residual, + __constant const net_t * restrict means, + __constant const net_t * restrict stddivs) { + + const int W = BOARD_SIZE; + const int H = BOARD_SIZE; + const int P = WTILES * WTILES; + + const int k = get_global_id(0); + const int block = get_global_id(1); + + // Adding some padding decreases bank conflicts + __local real out_buf[OUT_KWG][OUT_BWG][WINOGRAD_M][WINOGRAD_M + 1]; + + volatile int kid = get_local_id(0); + volatile int bid = get_local_id(1); + + if (k < K && block < batch_size * P) { + const real mean = vload_net_t(k, means); + const real scale_stddiv = vload_net_t(k, stddivs); + + real temp[WINOGRAD_M][WINOGRAD_ALPHA]; + + // M dimensions are [36, outputs, batch_size * tiles]. + // Plus zero padding from SGEMM. 
+ const int offset = block * Kpad + k; + + // Calculates transpose(A).temp_m + for (int xn = 0; xn < WINOGRAD_ALPHA; xn++) { + real temp_m0 = vload_net_t((0 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m1 = vload_net_t((1 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m2 = vload_net_t((2 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m3 = vload_net_t((3 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m4 = vload_net_t((4 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m5 = vload_net_t((5 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + multiply_at( + &(temp[0][xn]), &(temp[1][xn]), &(temp[2][xn]), &(temp[3][xn]), + temp_m0, temp_m1, temp_m2, temp_m3, temp_m4, temp_m5 + ); + } + + // Calculates temp.A + for (int i = 0; i < WINOGRAD_M; i++){ + real4 r; + multiply_atv( + &r, + temp[i][0], temp[i][1], temp[i][2], temp[i][3], temp[i][4], temp[i][5] + ); + + r = (r - mean) * scale_stddiv; + out_buf[kid][bid][i][0] = r.x; + out_buf[kid][bid][i][1] = r.y; + out_buf[kid][bid][i][2] = r.z; + out_buf[kid][bid][i][3] = r.w; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int idx = get_local_id(0) + get_local_size(0) * get_local_id(1); idx < OUT_BWG * OUT_KWG * WINOGRAD_M * WINOGRAD_M; idx += get_local_size(0) * get_local_size(1)) { + // Calculate indexing for coalesced memory access. + // This should be simplified somehow. 
+ const int k_local = idx / (OUT_BWG * WINOGRAD_M * WINOGRAD_M); + + const int idx_block = (idx - k_local * OUT_BWG * WINOGRAD_M * WINOGRAD_M); + + const int row = idx_block / (WINOGRAD_M * OUT_BWG); + const int col = (idx_block - row * WINOGRAD_M * OUT_BWG); + const int block_local = col / WINOGRAD_M; + + const int j = col % WINOGRAD_M; + const int i = row % WINOGRAD_M; + + const int blockt = get_group_id(1) * get_local_size(1) + block_local; + const int kt = get_group_id(0) * get_local_size(0) + k_local; + + const int batch = blockt / P; + const int blockt_x = (blockt - P * batch) % WTILES; + const int blockt_y = (blockt - P * batch) / WTILES; + + const int x = WINOGRAD_M * blockt_x; + const int y = WINOGRAD_M * blockt_y; + const int out_idx = batch * K * NUM_INTERSECTIONS + kt * NUM_INTERSECTIONS + (y + i) * W + (x + j); + + if (kt < K && blockt < batch_size * P && y + i < H && x + j < W) { + real acc = out_buf[k_local][block_local][i][j]; + if (residual) { + acc += vload_net_t(out_idx, residual); + } + acc = acc > ZERO ? 
acc : ZERO; + + vstore_net_t(acc, out_idx, Y); + } + } +} + +__kernel void out_transform_fused_bn_in( + __global const net_t * restrict M, + __global net_t * restrict Y, + __global net_t * restrict V, + const int K, + const int Kpad, const int Ppad, const int Cpad, + __global const net_t * restrict residual, + __constant const net_t * restrict means, + __constant const net_t * restrict stddivs) { + + const int W = BOARD_SIZE; + const int H = BOARD_SIZE; + const int P = WTILES * WTILES; + + const int k = get_global_id(0); + const int kg = get_local_id(0); + const int block = get_global_id(1); + const int batch = get_global_id(2); + + const int block_x = block % WTILES; + const int block_y = block / WTILES; + + const int x = WINOGRAD_M * block_x; + const int y = WINOGRAD_M * block_y; + + const int kHW = batch * K * NUM_INTERSECTIONS + k * NUM_INTERSECTIONS; + + __local real ybuf[OUTIN_KWG * NUM_INTERSECTIONS]; + + if (k < K && block < P) { + + const real mean = vload_net_t(k, means); + const real scale_stddiv = vload_net_t(k, stddivs); + + real temp[WINOGRAD_M][WINOGRAD_ALPHA]; + + // M dimensions are [36, outputs, batch_size * tiles]. + // Plus zero padding from SGEMM. 
+ + const int offset = block * Kpad + k; + + // Calculates transpose(A).temp_m + for (int xn = 0; xn < WINOGRAD_ALPHA; xn++) { + real temp_m0 = vload_net_t((0 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m1 = vload_net_t((1 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m2 = vload_net_t((2 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m3 = vload_net_t((3 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m4 = vload_net_t((4 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + real temp_m5 = vload_net_t((5 * WINOGRAD_ALPHA + xn) * Kpad * Ppad + offset, M); + + multiply_at( + &(temp[0][xn]), &(temp[1][xn]), &(temp[2][xn]), &(temp[3][xn]), + temp_m0, temp_m1, temp_m2, temp_m3, temp_m4, temp_m5 + ); + } + + // Calculates temp.A + for (int i = 0; i < WINOGRAD_M; i++){ + real4 r; + multiply_atv( + &r, + temp[i][0], temp[i][1], temp[i][2], temp[i][3], temp[i][4], temp[i][5] + ); + + r = scale_stddiv * (r - mean); + if (y + i < H && x + 0 < W) { + const int out_idx = (y + i) * W + (x + 0); + ybuf[kg * NUM_INTERSECTIONS + out_idx] = r.x; + } + if (y + i < H && x + 1 < W) { + const int out_idx = (y + i) * W + (x + 1); + ybuf[kg * NUM_INTERSECTIONS + out_idx] = r.y; + } + if (y + i < H && x + 2 < W) { + const int out_idx = (y + i) * W + (x + 2); + ybuf[kg * NUM_INTERSECTIONS + out_idx] = r.z; + } + if (y + i < H && x + 3 < W) { + const int out_idx = (y + i) * W + (x + 3); + ybuf[kg * NUM_INTERSECTIONS + out_idx] = r.w; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const int ks = get_local_size(0); + const int k0 = get_group_id(0) * get_local_size(0); + + for (int x = get_local_id(0) + ks * get_local_id(1); x < ks * NUM_INTERSECTIONS; x += get_local_size(1) * get_local_size(0)) { + const int kx = x / NUM_INTERSECTIONS; + const int idx = x - kx * NUM_INTERSECTIONS; + + const int kHWx = batch * K * NUM_INTERSECTIONS + (k0 + kx) * NUM_INTERSECTIONS; + + real acc = ybuf[kx * NUM_INTERSECTIONS + idx]; + if 
(residual) { + acc += vload_net_t(kHWx + idx, residual); + } + acc = acc > ZERO ? acc : ZERO; + + if (Y) { + vstore_net_t(acc, kHWx + idx, Y); + } + ybuf[kx * NUM_INTERSECTIONS + idx] = acc; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const int yin = WINOGRAD_M * block_y - 1; + const int xin = WINOGRAD_M * block_x - 1; + + if (block < P && k < K) { + const int CPpad = Ppad * Cpad; + // Cache input tile and handle zero padding + real xx[WINOGRAD_ALPHA][WINOGRAD_ALPHA]; + for (int i = 0; i < WINOGRAD_ALPHA; i++) { + int b = yin + i; + for (int j = 0; j < WINOGRAD_ALPHA; j++) { + int a = xin + j; + // x is transposed here for better layout later + if (b >= 0 && a >= 0 && b < H && a < W) { + xx[j][i] = ybuf[kg * NUM_INTERSECTIONS + b * W + a]; + } else { + xx[j][i] = ZERO; + } + } + } + + const int offset = k * Ppad + P * batch + block; + __in_transform_eq(xx, V, offset, CPpad); + } +} + +// End of the C++11 raw string literal +)" diff --git a/autogtp/Keypress.h b/src/kernels/tensorcore_test.opencl similarity index 52% rename from autogtp/Keypress.h rename to src/kernels/tensorcore_test.opencl index 4eb320da0..3a05a2f81 100644 --- a/autogtp/Keypress.h +++ b/src/kernels/tensorcore_test.opencl @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2017-2018 Marco Calignano + Copyright (C) 2017-2018 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,22 +16,20 @@ along with Leela Zero. If not, see . */ -#ifndef KEYPRESS_H -#define KEYPRESS_H -#include -#include "Management.h" +// This kernel simply tests if the host can compile a wmma insturction. +// Not intended to be run at all. -class KeyPress : public QObject -{ - Q_OBJECT -public: - explicit KeyPress(Management *boss, QObject *parent = nullptr); +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). 
Comment-out this line for syntax-highlighting when developing. +R"( -protected: - bool eventFilter(QObject *obj, QEvent *event); - Management *m_boss; -}; - -#endif // KEYPRESS_H +__kernel void tensorcore_test(__global int * ptr) { + asm( + ".reg .b32 a0, a1, a2, a3, a4, a5, a6, a7;\n" + "wmma.load.a.sync.aligned.m16n16k16.shared.row.f16 {a0,a1,a2,a3,a4,a5,a6,a7}, [%0];\n" : : "l"(ptr) + ); +} +// End of the C++11 raw string literal +)" diff --git a/src/tests/gtests.cpp b/src/tests/gtests.cpp index af2cce344..68d56d5dd 100644 --- a/src/tests/gtests.cpp +++ b/src/tests/gtests.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2018 Gian-Carlo Pascutto and contributors + Copyright (C) 2018-2019 Gian-Carlo Pascutto and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,13 +14,23 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . -*/ -#include + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
+*/ #include "config.h" -#include #include +#include +#include #include #include #include @@ -37,20 +47,21 @@ using namespace Utils; -void expect_regex(std::string s, std::string re, bool positive = true) { +void expect_regex(const std::string& s, const std::string& re, + const bool positive = true) { auto m = std::regex_search(s, std::regex(re)); if (positive && !m) { - FAIL() << "Output:" << std::endl << s - << "Does not contain:" << std::endl - << re << std::endl; + FAIL() << "Output:" << std::endl + << s << "Does not contain:" << std::endl + << re << std::endl; } else if (!positive && m) { - FAIL() << "output:" << std::endl << s - << "Should not contain:" << std::endl - << re << std::endl; + FAIL() << "output:" << std::endl + << s << "Should not contain:" << std::endl + << re << std::endl; } } -class LeelaEnv: public ::testing::Environment { +class LeelaEnv : public ::testing::Environment { public: ~LeelaEnv() {} void SetUp() { @@ -69,17 +80,20 @@ class LeelaEnv: public ::testing::Environment { // improves reproducibility across platforms. 
Random::get_Rng().seedrandom(cfg_rng_seed); - NNCache::get_NNCache().set_size_from_playouts(cfg_max_playouts); - cfg_weightsfile = "../src/tests/0k.txt"; - Network::initialize(); + + auto playouts = std::min(cfg_max_playouts, cfg_max_visits); + auto network = std::make_unique(); + network->initialize(playouts, cfg_weightsfile); + GTP::initialize(std::move(network)); } void TearDown() {} }; -::testing::Environment* const leela_env = ::testing::AddGlobalTestEnvironment(new LeelaEnv); +::testing::Environment* const leela_env = + ::testing::AddGlobalTestEnvironment(new LeelaEnv); -class LeelaTest: public ::testing::Test { +class LeelaTest : public ::testing::Test { public: LeelaTest() { // Reset engine parameters @@ -94,13 +108,16 @@ class LeelaTest: public ::testing::Test { GameState& get_gamestate() { return *m_gamestate; } - std::pair gtp_execute(std::string cmd) { + std::pair gtp_execute(const std::string& cmd) { testing::internal::CaptureStdout(); testing::internal::CaptureStderr(); GTP::execute(get_gamestate(), cmd); return std::make_pair(testing::internal::GetCapturedStdout(), testing::internal::GetCapturedStderr()); } + void test_analyze_cmd(const std::string& cmd, bool valid, int who, + int interval, int avoidlen, int avoidcolor, + int avoiduntil); private: std::unique_ptr m_gamestate; @@ -141,7 +158,7 @@ TEST_F(LeelaTest, Transposition) { EXPECT_EQ(ko_hash, maingame.board.get_ko_hash()); } -TEST_F(LeelaTest, KoSqNotSame) { +TEST_F(LeelaTest, KoPntNotSame) { auto maingame = get_gamestate(); testing::internal::CaptureStdout(); @@ -177,11 +194,11 @@ TEST_F(LeelaTest, KoSqNotSame) { // Board position is the same EXPECT_EQ(ko_hash, maingame.board.get_ko_hash()); - // But ko (square) is not + // But ko (intersection) is not EXPECT_NE(hash, maingame.board.get_hash()); } -TEST_F(LeelaTest, MoveOnOccupiedSq) { +TEST_F(LeelaTest, MoveOnOccupiedPnt) { auto maingame = get_gamestate(); std::string output; @@ -241,12 +258,121 @@ TEST_F(LeelaTest, TimeControl2) { result = 
gtp_execute("kgs-time_settings byoyomi 0 100 1"); result = gtp_execute("go"); result = gtp_execute("showboard"); - expect_regex(result.second, "Black time: 00:01:40, 1 period\\(s\\) of 100 seconds left"); - expect_regex(result.second, "White time: 00:01:40, 1 period\\(s\\) of 100 seconds left"); + expect_regex(result.second, + "Black time: 00:01:40, 1 period\\(s\\) of 100 seconds left"); + expect_regex(result.second, + "White time: 00:01:40, 1 period\\(s\\) of 100 seconds left"); result = gtp_execute("kgs-time_settings byoyomi 0 120 1"); result = gtp_execute("go"); result = gtp_execute("showboard"); - expect_regex(result.second, "Black time: 00:02:00, 1 period\\(s\\) of 120 seconds left"); - expect_regex(result.second, "White time: 00:02:00, 1 period\\(s\\) of 120 seconds left"); + expect_regex(result.second, + "Black time: 00:02:00, 1 period\\(s\\) of 120 seconds left"); + expect_regex(result.second, + "White time: 00:02:00, 1 period\\(s\\) of 120 seconds left"); +} + +void LeelaTest::test_analyze_cmd(const std::string& cmd, const bool valid, + const int who, const int interval, + const int avoidlen, const int avoidcolor, + const int avoiduntil) { + // std::cout << "testing " << cmd << std::endl; + // avoid_until checks against the absolute game move number, indexed from 0 + std::istringstream cmdstream(cmd); + auto maingame = get_gamestate(); + AnalyzeTags result{cmdstream, maingame}; + EXPECT_EQ(result.m_invalid, !valid); + if (!valid) return; + EXPECT_EQ(result.m_who, who); + EXPECT_EQ(result.m_interval_centis, interval); + EXPECT_EQ(result.m_moves_to_avoid.size(), avoidlen); + if (avoidlen) { + EXPECT_EQ(result.m_moves_to_avoid[0].color, avoidcolor); + EXPECT_EQ(result.m_moves_to_avoid[0].until_move, avoiduntil); + } +} + +// Test parsing the lz-analyze command line +TEST_F(LeelaTest, AnalyzeParse) { + gtp_execute("clear_board"); + + test_analyze_cmd("b 50", true, FastBoard::BLACK, 50, 0, -1, -1); + test_analyze_cmd("50 b", true, FastBoard::BLACK, 50, 0, -1, 
-1); + test_analyze_cmd("b interval 50", true, FastBoard::BLACK, 50, 0, -1, -1); + test_analyze_cmd("interval 50 b", true, FastBoard::BLACK, 50, 0, -1, -1); + test_analyze_cmd("b interval", false, -1, -1, -1, -1, -1); + test_analyze_cmd("42 w", true, FastBoard::WHITE, 42, 0, -1, -1); + test_analyze_cmd("1234", true, FastBoard::BLACK, 1234, 0, -1, -1); + gtp_execute("play b q16"); + test_analyze_cmd("1234", true, FastBoard::WHITE, 1234, 0, -1, -1); + test_analyze_cmd("b 100 avoid b k10 1", true, FastBoard::BLACK, 100, 1, + FastBoard::BLACK, 1); + test_analyze_cmd("b 100 avoid b k10 1 avoid b a1 1", true, FastBoard::BLACK, + 100, 2, FastBoard::BLACK, 1); + test_analyze_cmd("b 100 avoid w k10 8", true, FastBoard::BLACK, 100, 1, + FastBoard::WHITE, 8); + gtp_execute("play w q4"); + test_analyze_cmd("b 100 avoid b k10 8", true, FastBoard::BLACK, 100, 1, + FastBoard::BLACK, 9); + test_analyze_cmd("100 b avoid b k10 8", true, FastBoard::BLACK, 100, 1, + FastBoard::BLACK, 9); + test_analyze_cmd("b avoid b k10 8 100", true, FastBoard::BLACK, 100, 1, + FastBoard::BLACK, 9); + test_analyze_cmd("avoid b k10 8 100 b", true, FastBoard::BLACK, 100, 1, + FastBoard::BLACK, 9); + test_analyze_cmd("avoid b k10 8 100 w", true, FastBoard::WHITE, 100, 1, + FastBoard::BLACK, 9); + test_analyze_cmd("avoid b z10 8 100 w", false, -1, -1, -1, -1, -1); + test_analyze_cmd("avoid b k10 8 100 w bogus", false, -1, -1, -1, -1, -1); + test_analyze_cmd("avoid b k10 8 100 w avoid b pass 17", true, + FastBoard::WHITE, 100, 2, FastBoard::BLACK, 9); + test_analyze_cmd("avoid b k10 8 w avoid b pass 17", true, FastBoard::WHITE, + 0, 2, FastBoard::BLACK, 9); + + gtp_execute("clear_board"); + test_analyze_cmd("b avoid b a1 10 allow b t1 1", false, -1, -1, -1, -1, -1); + test_analyze_cmd("b avoid w a1 10 allow b t1 1", true, FastBoard::BLACK, 0, + 1, FastBoard::WHITE, 9); + test_analyze_cmd("b avoid b pass 10 allow b t1 1", true, FastBoard::BLACK, + 0, 1, FastBoard::BLACK, 9); + test_analyze_cmd("b avoid b 
resign 10 allow b t1 1", true, FastBoard::BLACK, + 0, 1, FastBoard::BLACK, 9); + test_analyze_cmd("b avoid w c3,c4,d3,d4 2 avoid b pass 50", true, + FastBoard::BLACK, 0, 5, FastBoard::WHITE, 1); + test_analyze_cmd("b avoid w c3,c4,d3,d4, 2 avoid b pass 50", false, -1, -1, + -1, -1, -1); + + gtp_execute("clear_board"); + test_analyze_cmd("b avoid b q16 1", true, FastBoard::BLACK, 0, 1, + FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b : 1", false, -1, -1, -1, -1, -1); + test_analyze_cmd("b avoid b d4: 1", false, -1, -1, -1, -1, -1); + test_analyze_cmd("b avoid b d14: 1", false, -1, -1, -1, -1, -1); + test_analyze_cmd("b avoid b :e3 1", false, -1, -1, -1, -1, -1); + test_analyze_cmd("b avoid b d:e3 1", false, -1, -1, -1, -1, -1); + test_analyze_cmd("b avoid b q16:q16 20", true, FastBoard::BLACK, 0, 1, + FastBoard::BLACK, 19); + test_analyze_cmd("b avoid b q16:t19 1", true, FastBoard::BLACK, 0, 16, + FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b t19:q16 1", true, FastBoard::BLACK, 0, 16, + FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b t16:q19 1", true, FastBoard::BLACK, 0, 16, + FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b q19:t16 1", true, FastBoard::BLACK, 0, 16, + FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b a1:t19 1", true, FastBoard::BLACK, 0, 361, + FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b a1:t19 1 avoid w pass 1 avoid w resign 1", true, + FastBoard::BLACK, 0, 363, FastBoard::BLACK, 0); + test_analyze_cmd("b avoid b a1:t19,pass,resign 1", true, FastBoard::BLACK, + 0, 363, FastBoard::BLACK, 0); +} + +TEST_F(LeelaTest, AnalyzeParseMinmoves) { + gtp_execute("clear_board"); + gtp_execute("lz-setoption name pondering value false"); + gtp_execute("lz-setoption name playouts value 1"); + auto result = gtp_execute("lz-analyze b interval 1 minmoves 5"); + // Expect to see at least 5 move priors + expect_regex(result.first, "info.*?(prior\\s+\\d+\\s+.*?){5,}.*"); } diff --git a/src/tests/utils_unittest.cpp 
b/src/tests/utils_unittest.cpp index f3f898847..c23e0b4d4 100644 --- a/src/tests/utils_unittest.cpp +++ b/src/tests/utils_unittest.cpp @@ -1,6 +1,6 @@ /* This file is part of Leela Zero. - Copyright (C) 2018 Seth Troisi and contributors + Copyright (C) 2018-2019 Seth Troisi and contributors Leela Zero is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,6 +14,17 @@ You should have received a copy of the GNU General Public License along with Leela Zero. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the + NVIDIA CUDA Toolkit and/or the NVIDIA CUDA Deep Neural + Network library and/or the NVIDIA TensorRT inference library + (or a modified version of those libraries), containing parts covered + by the terms of the respective license agreement, the licensors of + this Program grant you additional permission to convey the resulting + work. 
*/ #include @@ -38,7 +49,7 @@ TEST(UtilsTest, CeilMultiple) { EXPECT_EQ(ceilMultiple(0, 1), (size_t)0); EXPECT_EQ(ceilMultiple(0, 3), (size_t)0); - EXPECT_EQ(ceilMultiple(6, 1), (size_t)6); + EXPECT_EQ(ceilMultiple(6, 1), (size_t)6); EXPECT_EQ(ceilMultiple(23, 1), (size_t)23); EXPECT_EQ(ceilMultiple(2, 2), (size_t)2); @@ -56,7 +67,8 @@ TEST(UtilsTest, CeilMultiple) { EXPECT_EQ(ceilMultiple(99, 100), (size_t)100); } -double randomlyDistributedProbability(std::vector values, double expected) { +double randomlyDistributedProbability(const std::vector& values, + const double expected) { auto count = values.size(); // h0: each number had a (1 / count) chance @@ -79,8 +91,8 @@ double randomlyDistributedProbability(std::vector values, double expected return boost::math::gamma_p(degrees_of_freedom / 2.0, x / 2.0); } -bool rngBucketsLookRandom(double p, double alpha) { - return p >= (alpha/2) && p <= (1-alpha/2); +bool rngBucketsLookRandom(const double p, const double alpha) { + return p >= (alpha / 2) && p <= (1 - alpha / 2); } TEST(UtilsTest, RandFix) { diff --git a/training/elf/elf_convert.py b/training/elf/elf_convert.py index cf8412ea3..89d65a442 100755 --- a/training/elf/elf_convert.py +++ b/training/elf/elf_convert.py @@ -41,7 +41,10 @@ def write_block(f, b): with open('elf_converted_weights.txt', 'w') as f: # version 2 means value head is for black, not for side to move f.write('2\n') - b = convert_block(state, 'init_conv') + if 'init_conv.0.weight' in state: + b = convert_block(state, 'init_conv') + else: + b = convert_block(state, 'init_conv.module') # Permutate input planes p = [0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 16, 17] diff --git a/training/minigo/convert_minigo.py b/training/minigo/convert_minigo.py new file mode 100755 index 000000000..14148454d --- /dev/null +++ b/training/minigo/convert_minigo.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +import gzip +import re +import os +import sys + +import numpy as np +import tensorflow as tf + +# 
Hide boring TF log statements +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' # or any {'0', '1', '2'} + + +def matches(name, parts): + return all(part in name for part in parts) + +def deduped(names): + names = [re.sub('_\d+', '', name) for name in names] + return sorted([(n, names.count(n)) for n in set(names)]) + +def getMinigoWeightsV1(model): + """Load and massage Minigo weights to Leela format. + + This version works on older models (v9 or before) + But was broken when conv bias was removed in v10 + See: https://github.com/tensorflow/minigo/pull/292 and + https://github.com/gcp/leela-zero/issues/2020 + """ + sess = tf.Session() + saver = tf.train.import_meta_graph(model+'.meta') + saver.restore(sess, model) + + trainable_names = [] + for v in tf.trainable_variables(): + trainable_names.append(v.name) + + weights = [] + for v in tf.global_variables(): + if v.name in trainable_names: + weights.append(v) + elif 'batch_normalization' in v.name: + # Moving mean and variance are not trainable, but are needed for the model + if 'moving_mean' in v.name or 'moving_variance' in v.name: + weights.append(v) + + # To match the format of V2 + weights_v2_format = [] + for w in weights: + nparray = w.eval(session=sess) + weights_v2_format.append((w.name, nparray)) + return weights_v2_format + +def getMinigoWeightsV2(model): + """Load and massage Minigo weights to Leela format. + + This version works on older models (v9 or before) + But was broken when conv bias was removed in v10 + See: https://github.com/tensorflow/minigo/pull/292 and + https://github.com/gcp/leela-zero/issues/2020 + """ + var_names = tf.train.load_checkpoint(model).get_variable_to_dtype_map() + + # count() overcounts by 3 from policy/value head and each layer has two convolutions. 
+ layers = (max([count for n, count in deduped(var_names)]) - 3) // 2 + print (layers, 'layers') + + has_conv_bias = any(matches(name, ('conv2d', 'bias')) for name in var_names.keys()) + if not has_conv_bias: + print('Did not find conv bias in this model, using all zeros') + empty_conv_bias = tf.constant([], name='placeholder_for_conv_bias') + + # 2 * layer copies of + # 6*n + 0: conv2d/kernel:0 + # 6*n + 1: conv2d/bias:0 + # 6*n + 2: batch_normalization/gamma:0 + # 6*n + 3: batch_normalization/beta:0 + # 6*n + 4: batch_normalization/moving_mean:0 + # 6*n + 5: batch_normalization/moving_variance:0 + # at the end 2x + # conv2d_39/kernel:0 + # conv2d_39/bias:0 + # batch_normalization_39/moving_mean:0 + # batch_normalization_39/moving_variance:0 + # dense/kernel:0 + # dense/bias:0 + # final value dense + # dense_2/kernel:0 + # dense_2/bias:0 + + weight_names = [] + + def tensor_number(number): + return '' if number ==0 else '_' + str(number) + + def add_conv(number, with_gamma=True): + number = tensor_number(number) + weight_names.append('conv2d{}/kernel:0'.format(number)) + weight_names.append('conv2d{}/bias:0'.format(number)) + if with_gamma: + weight_names.append('batch_normalization{}/gamma:0'.format(number)) + weight_names.append('batch_normalization{}/beta:0'.format(number)) + weight_names.append('batch_normalization{}/moving_mean:0'.format(number)) + weight_names.append('batch_normalization{}/moving_variance:0'.format(number)) + + def add_dense(number): + number = tensor_number(number) + weight_names.append('dense{}/kernel:0'.format(number)) + weight_names.append('dense{}/bias:0'.format(number)) + + # This blindly builds the correct names for the tensors. + for l in range(2 * layers + 1): + add_conv(l) + + add_conv(2 * layers + 1, with_gamma=False) + add_dense(0) + add_conv(2 * layers + 2, with_gamma=False) + add_dense(1) + add_dense(2) + + # This tries to load the data for each tensors. 
+ weights = [] + for i, name in enumerate(weight_names): + if matches(name, ('conv2d', 'bias')) and not has_conv_bias: + w = np.zeros(weights[-1][1].shape[-1:]) + else: + w = tf.train.load_variable(model, name) + +# print ("{:45} {} {}".format(name, type(w), w.shape)) + weights.append((name, w)) + return weights + +def merge_gammas(weights): + out_weights = [] + skip = 0 + for e, (name, w) in enumerate(weights): + if skip > 0: + skip -= 1 + continue + + if matches(name, ('conv2d', 'kernel')) and 'gamma' in weights[e+2][0]: + kernel = w + bias = weights[e+1][1] + gamma = weights[e+2][1] + beta = weights[e+3][1] + mean = weights[e+4][1] + var = weights[e+5][1] + + new_kernel = kernel * np.reshape(gamma, (1, 1, 1, -1)) + new_bias = gamma * bias + beta * np.sqrt(var + 1e-5) + new_mean = mean * gamma + + out_weights.append(new_kernel) + out_weights.append(new_bias) + out_weights.append(new_mean) + out_weights.append(var) + + skip = 5 + + elif matches(name, ('dense', 'kernel')): + # Minigo uses channels last order while LZ uses channels first, + # Do some surgery for the dense layers to make the output match. 
+ planes = w.shape[0] // 361 + if planes > 0: + w1 = np.reshape(w, [19, 19, planes, -1]) + w2 = np.transpose(w1, [2, 0, 1, 3]) + new_kernel = np.reshape(w2, [361*planes, -1]) + out_weights.append(new_kernel) + else: + out_weights.append(w) + else: + out_weights.append(w) + + return out_weights + +def save_leelaz_weights(filename, weights): + with gzip.open(filename, 'wb') as f_out: + # Version tag + # Minigo outputs winrate from blacks point of view (same as ELF) + f_out.write(b'2') + for e, w in enumerate(weights): + # Newline unless last line (single bias) + f_out.write(b'\n') + work_weights = None + if len(w.shape) == 4: + # Convolution weights need a transpose + # + # TF (kYXInputOutput) + # [filter_height, filter_width, in_channels, out_channels] + # + # Leela/cuDNN/Caffe (kOutputInputYX) + # [output, input, filter_size, filter_size] + work_weights = np.transpose(w, [3, 2, 0, 1]) + elif len(w.shape) == 2: + # Fully connected layers are [in, out] in TF + # + # [out, in] in Leela + # + work_weights = np.transpose(w, [1, 0]) + else: + # Biases, batchnorm etc + work_weights = w + if e == 0: + # Fix input planes + # + # Add zero weights for white to play input plane + work_weights = np.pad(work_weights, ((0, 0), (0, 1), (0, 0), (0, 0)), 'constant', constant_values=0) + + # Permutate weights + p = [0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 16, 17] + + work_weights = work_weights[:, p, :, :] + + # 80% of time is in this format line. + wt_str = ("{:0.4g}".format(wt) for wt in np.ravel(work_weights)) + f_out.write(' '.join(wt_str).encode()) + + +def main(): + if len(sys.argv) < 2: + print('Model filename without extension needed as an argument.') + exit() + + model = sys.argv[1] + + print ('loading ', model) + print () + + # Can be used for v9 or before models. 
+ # weights = getMinigoWeightsV1(model) + weights = getMinigoWeightsV2(model) + if 0: + for name, variables in [ + ('load_checkpoint', var_names.keys()), + # ('trainable_names', trainable_names), + # ('global_variable', [v.name for v in tf.global_variables()]) + ]: + print (name, len(variables)) + print (deduped(variables)) + print () + + save_leelaz_weights(model + '_converted.txt.gz', merge_gammas(weights)) + +if __name__ == "__main__": + main() diff --git a/training/tf/chunkparser.py b/training/tf/chunkparser.py index 2bdd68523..9c339e7d5 100644 --- a/training/tf/chunkparser.py +++ b/training/tf/chunkparser.py @@ -173,8 +173,11 @@ def convert_v1_to_v2(self, text_item): for plane in range(0, 16): # first 360 first bits are 90 hex chars, encoded MSB hex_string = text_item[plane][0:90] - array = np.unpackbits(np.frombuffer( - bytearray.fromhex(hex_string), dtype=np.uint8)) + try: + array = np.unpackbits(np.frombuffer( + bytearray.fromhex(hex_string), dtype=np.uint8)) + except: + return False, None # Remaining bit that didn't fit. Encoded LSB so # it needs to be specially handled. 
last_digit = text_item[plane][90] diff --git a/training/tf/mixprec.py b/training/tf/mixprec.py new file mode 100644 index 000000000..889fb6abf --- /dev/null +++ b/training/tf/mixprec.py @@ -0,0 +1,48 @@ +import tensorflow as tf + + +def float32_variable_storage_getter(getter, name, shape=None, dtype=None, + initializer=None, regularizer=None, + trainable=True, + *args, **kwargs): + """Custom variable getter that forces trainable variables to be stored in + float32 precision and then casts them to the training precision.""" + storage_dtype = tf.float32 if trainable else dtype + variable = getter(name, shape, dtype=storage_dtype, + initializer=initializer, + regularizer=regularizer, + trainable=trainable, + *args, **kwargs) + if trainable and dtype != tf.float32: + cast_name = name + '/fp16_cast' + try: + cast_variable = tf.get_default_graph().get_tensor_by_name( + cast_name + ':0') + except KeyError: + cast_variable = tf.cast(variable, dtype, name=cast_name) + cast_variable._ref = variable._ref + variable = cast_variable + return variable + + +class LossScalingOptimizer(tf.train.Optimizer): + """An optimizer that scales loss and un-scales gradients.""" + + def __init__(self, optimizer, + scale=None, + name="LossScalingOptimizer", + use_locking=False): + super(LossScalingOptimizer, self).__init__( + name=name, use_locking=use_locking) + self._optimizer = optimizer + self._scale = float(scale) if scale is not None else 1.0 + + def compute_gradients(self, loss, var_list=None, *args, **kwargs): + if self._scale != 1.0: + loss = tf.scalar_mul(self._scale, loss) + gradvar = self._optimizer.compute_gradients(loss, var_list, *args, **kwargs) + gradvar = [(tf.scalar_mul(1. 
/ self._scale, g), v) for g, v in gradvar] + return gradvar + + def apply_gradients(self, *args, **kwargs): + return self._optimizer.apply_gradients(*args, **kwargs) diff --git a/training/tf/net_to_model.py b/training/tf/net_to_model.py index ebefd4303..5a2e58b2c 100755 --- a/training/tf/net_to_model.py +++ b/training/tf/net_to_model.py @@ -23,14 +23,8 @@ blocks //= 8 print("Blocks", blocks) -tfprocess = TFProcess() -tfprocess.init(batch_size=1) -if tfprocess.RESIDUAL_BLOCKS != blocks: - raise ValueError("Number of blocks in tensorflow model doesn't match "\ - "number of blocks in input network") -if tfprocess.RESIDUAL_FILTERS != channels: - raise ValueError("Number of filters in tensorflow model doesn't match "\ - "number of filters in input network") +tfprocess = TFProcess(blocks, channels) +tfprocess.init(batch_size=1, gpus_num=1) tfprocess.replace_weights(weights) path = os.path.join(os.getcwd(), "leelaz-model") save_path = tfprocess.saver.save(tfprocess.session, path, global_step=0) diff --git a/training/tf/parse.py b/training/tf/parse.py index 04ca24d7f..e67774da8 100755 --- a/training/tf/parse.py +++ b/training/tf/parse.py @@ -107,24 +107,38 @@ def split_chunks(chunks, test_ratio): def main(): parser = argparse.ArgumentParser( description='Train network from game data.') + parser.add_argument("blockspref", + help="Number of blocks", nargs='?', type=int) + parser.add_argument("filterspref", + help="Number of filters", nargs='?', type=int) parser.add_argument("trainpref", help='Training file prefix', nargs='?', type=str) parser.add_argument("restorepref", help='Training snapshot prefix', nargs='?', type=str) + parser.add_argument("--blocks", '-b', + help="Number of blocks", type=int) + parser.add_argument("--filters", '-f', + help="Number of filters", type=int) parser.add_argument("--train", '-t', help="Training file prefix", type=str) parser.add_argument("--test", help="Test file prefix", type=str) parser.add_argument("--restore", type=str, help="Prefix of 
tensorflow snapshot to restore from") parser.add_argument("--logbase", default='leelalogs', type=str, - help="Log file prefix (for tensorboard)") + help="Log file prefix (for tensorboard) (default: %(default)s)") parser.add_argument("--sample", default=DOWN_SAMPLE, type=int, - help="Rate of data down-sampling to use") + help="Rate of data down-sampling to use (default: %(default)d)") args = parser.parse_args() + blocks = args.blocks or args.blockspref + filters = args.filters or args.filterspref train_data_prefix = args.train or args.trainpref restore_prefix = args.restore or args.restorepref + if not blocks or not filters: + print("Must supply number of blocks and filters") + return + training = get_chunks(train_data_prefix) if not args.test: # Generate test by taking 10% of the training chunks. @@ -150,7 +164,7 @@ def main(): sample=args.sample, batch_size=RAM_BATCH_SIZE).parse() - tfprocess = TFProcess() + tfprocess = TFProcess(blocks, filters) tfprocess.init(RAM_BATCH_SIZE, logbase=args.logbase, macrobatch=BATCH_SIZE // RAM_BATCH_SIZE) diff --git a/training/tf/quantize_weights.py b/training/tf/quantize_weights.py new file mode 100755 index 000000000..c9bd8a1f8 --- /dev/null +++ b/training/tf/quantize_weights.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +import sys, os, argparse + +def format_n(x): + x = float(x) + x = '{:.3g}'.format(x) + x = x.replace('e-0', 'e-') + if x.startswith('0.'): + x = x[1:] + if x.startswith('-0.'): + x = '-' + x[2:] + return x + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description='Quantize network file to decrease the file size.') + parser.add_argument("input", + help='Input file', type=str) + + parser.add_argument("-o", "--output", + help='Output file. 
Defaults to input + "_quantized"', + required=False, type=str, default=None) + + args = parser.parse_args() + + if args.output == None: + output_name = os.path.splitext(sys.argv[1]) + output_name = output_name[0] + '_quantized' + output_name[1] + else: + output_name = args.output + output = open(output_name, 'w') + + calculate_error = True + error = 0 + + with open(args.input, 'r') as f: + for line in f: + line = line.split(' ') + lineq = list(map(format_n, line)) + + if calculate_error: + e = sum((float(line[i]) - float(lineq[i]))**2 for i in range(len(line))) + error += e/len(line) + output.write(' '.join(lineq) + '\n') + + if calculate_error: + print('Weight file difference L2-norm: {}'.format(error**0.5)) + + output.close() diff --git a/training/tf/requirements.txt b/training/tf/requirements.txt index ee9dbd975..9bb7afad4 100644 --- a/training/tf/requirements.txt +++ b/training/tf/requirements.txt @@ -1,4 +1,4 @@ -bleach==1.5.0 +bleach==3.3.0 enum34==1.1.6 futures==3.1.1 html5lib==0.9999999 @@ -7,6 +7,6 @@ numpy==1.13.3 protobuf==3.4.0 scipy==1.0.0 six==1.11.0 -tensorflow==1.4.0 +tensorflow>=1.12.1 tensorflow-tensorboard==0.4.0rc2 -Werkzeug==0.12.2 +Werkzeug==0.15.3 diff --git a/training/tf/tfprocess.py b/training/tf/tfprocess.py index 7542ce3bc..be66df9e0 100644 --- a/training/tf/tfprocess.py +++ b/training/tf/tfprocess.py @@ -16,27 +16,35 @@ # You should have received a copy of the GNU General Public License # along with Leela Zero. If not, see . +import math import numpy as np import os -import random import tensorflow as tf import time import unittest -def weight_variable(shape): +from mixprec import float32_variable_storage_getter, LossScalingOptimizer + + +def weight_variable(name, shape, dtype): """Xavier initialization""" stddev = np.sqrt(2.0 / (sum(shape))) - initial = tf.truncated_normal(shape, stddev=stddev) - weights = tf.Variable(initial) + # Do not use a constant as the initializer, that will cause the + # variable to be stored in wrong dtype. 
+ weights = tf.get_variable( + name, shape, dtype=dtype, + initializer=tf.truncated_normal_initializer(stddev=stddev, dtype=dtype)) tf.add_to_collection(tf.GraphKeys.WEIGHTS, weights) return weights # Bias weights for layers not followed by BatchNorm # We do not regularlize biases, so they are not # added to the regularlizer collection -def bias_variable(shape): - initial = tf.constant(0.0, shape=shape) - return tf.Variable(initial) +def bias_variable(name, shape, dtype): + bias = tf.get_variable(name, shape, dtype=dtype, + initializer=tf.zeros_initializer()) + return bias + def conv2d(x, W): return tf.nn.conv2d(x, W, data_format='NCHW', @@ -103,10 +111,22 @@ def elapsed(self): return e class TFProcess: - def __init__(self): + def __init__(self, residual_blocks, residual_filters): # Network structure - self.RESIDUAL_FILTERS = 128 - self.RESIDUAL_BLOCKS = 6 + self.residual_blocks = residual_blocks + self.residual_filters = residual_filters + + # model type: full precision (fp32) or mixed precision (fp16) + self.model_dtype = tf.float32 + + # Scale the loss to prevent gradient underflow + self.loss_scale = 1 if self.model_dtype == tf.float32 else 128 + + # L2 regularization parameter applied to weights. 
+ self.l2_scale = 1e-4 + + # Set number of GPUs for training + self.gpus_num = 1 # For exporting self.weights = [] @@ -126,13 +146,13 @@ def __init__(self): self.swa_recalc_bn = True gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) - config = tf.ConfigProto(gpu_options=gpu_options) + config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True) self.session = tf.Session(config=config) self.training = tf.placeholder(tf.bool) self.global_step = tf.Variable(0, name='global_step', trainable=False) - def init(self, batch_size, macrobatch=1, logbase='leelalogs'): + def init(self, batch_size, macrobatch=1, gpus_num=None, logbase='leelalogs'): self.batch_size = batch_size self.macrobatch = macrobatch self.logbase = logbase @@ -147,22 +167,71 @@ def init(self, batch_size, macrobatch=1, logbase='leelalogs'): probs = tf.decode_raw(self.probs, tf.float32) winner = tf.decode_raw(self.winner, tf.float32) - planes = tf.to_float(planes) + planes = tf.cast(planes, self.model_dtype) planes = tf.reshape(planes, (batch_size, 18, 19*19)) probs = tf.reshape(probs, (batch_size, 19*19 + 1)) winner = tf.reshape(winner, (batch_size, 1)) - self.init_net(planes, probs, winner) + if gpus_num is None: + gpus_num = self.gpus_num + self.init_net(planes, probs, winner, gpus_num) - def init_net(self, planes, probs, winner): - self.x = planes # (tf.float32, [None, 18, 19 * 19]) - self.y_ = probs # (tf.float32, [None, 362]) - self.z_ = winner # (tf.float32, [None, 1]) + def init_net(self, planes, probs, winner, gpus_num): + self.y_ = probs # (tf.float32, [None, 362]) + self.sx = tf.split(planes, gpus_num) + self.sy_ = tf.split(probs, gpus_num) + self.sz_ = tf.split(winner, gpus_num) self.batch_norm_count = 0 - self.y_conv, self.z_conv = self.construct_net(self.x) + self.reuse_var = None + + # You need to change the learning rate here if you are training + # from a self-play training set, for example start with 0.005 instead. 
+ opt = tf.train.MomentumOptimizer( + learning_rate=0.05, momentum=0.9, use_nesterov=True) - if self.swa_enabled == True: + opt = LossScalingOptimizer(opt, scale=self.loss_scale) + + # Construct net here. + tower_grads = [] + tower_loss = [] + tower_policy_loss = [] + tower_mse_loss = [] + tower_reg_term = [] + tower_y_conv = [] + with tf.variable_scope("fp32_storage", + # this forces trainable variables to be stored as fp32 + custom_getter=float32_variable_storage_getter): + for i in range(gpus_num): + with tf.device("/gpu:%d" % i): + with tf.name_scope("tower_%d" % i): + loss, policy_loss, mse_loss, reg_term, y_conv = self.tower_loss( + self.sx[i], self.sy_[i], self.sz_[i]) + + # Reset batchnorm key to 0. + self.reset_batchnorm_key() + + tf.get_variable_scope().reuse_variables() + with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): + grads = opt.compute_gradients(loss) + + tower_grads.append(grads) + tower_loss.append(loss) + tower_policy_loss.append(policy_loss) + tower_mse_loss.append(mse_loss) + tower_reg_term.append(reg_term) + tower_y_conv.append(y_conv) + + # Average gradients from different GPUs + self.loss = tf.reduce_mean(tower_loss) + self.policy_loss = tf.reduce_mean(tower_policy_loss) + self.mse_loss = tf.reduce_mean(tower_mse_loss) + self.reg_term = tf.reduce_mean(tower_reg_term) + self.y_conv = tf.concat(tower_y_conv, axis=0) + self.mean_grads = self.average_gradients(tower_grads) + + # Do swa after we contruct the net + if self.swa_enabled is True: # Count of networks accumulated into SWA self.swa_count = tf.Variable(0., name='swa_count', trainable=False) # Count of networks to skip @@ -183,38 +252,12 @@ def init_net(self, planes, probs, winner): self.swa_accum_op = tf.assign_add(n, 1.) 
self.swa_load_op = tf.group(*load) - # Calculate loss on policy head - cross_entropy = \ - tf.nn.softmax_cross_entropy_with_logits(labels=self.y_, - logits=self.y_conv) - self.policy_loss = tf.reduce_mean(cross_entropy) - - # Loss on value head - self.mse_loss = \ - tf.reduce_mean(tf.squared_difference(self.z_, self.z_conv)) - - # Regularizer - regularizer = tf.contrib.layers.l2_regularizer(scale=0.0001) - reg_variables = tf.get_collection(tf.GraphKeys.WEIGHTS) - self.reg_term = \ - tf.contrib.layers.apply_regularization(regularizer, reg_variables) - - # For training from a (smaller) dataset of strong players, you will - # want to reduce the factor in front of self.mse_loss here. - self.loss = 1.0 * self.policy_loss + 1.0 * self.mse_loss + self.reg_term - - # You need to change the learning rate here if you are training - # from a self-play training set, for example start with 0.005 instead. - opt = tf.train.MomentumOptimizer( - learning_rate=0.05, momentum=0.9, use_nesterov=True) - - # Compute and accumulate gradients + # Accumulate gradients self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) total_grad=[] grad_ops=[] clear_var=[] - with tf.control_dependencies(self.update_ops): - self.grad_op_real = opt.compute_gradients(self.loss) + self.grad_op_real = self.mean_grads for (g, v) in self.grad_op_real: if g is None: total_grad.append((g,v)) @@ -261,6 +304,52 @@ def init_net(self, planes, probs, winner): # Initialize all variables self.session.run(tf.global_variables_initializer()) + def average_gradients(self, tower_grads): + # Average gradients from different GPUs + average_grads = [] + for grad_and_vars in zip(*tower_grads): + grads = [] + for g, _ in grad_and_vars: + expanded_g = tf.expand_dims(g, dim=0) + grads.append(expanded_g) + + grad = tf.concat(grads, axis=0) + grad = tf.reduce_mean(grad, reduction_indices=0) + + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + + def 
tower_loss(self, x, y_, z_): + y_conv, z_conv = self.construct_net(x) + + # Cast the nn result back to fp32 to avoid loss overflow/underflow + if self.model_dtype != tf.float32: + y_conv = tf.cast(y_conv, tf.float32) + z_conv = tf.cast(z_conv, tf.float32) + + # Calculate loss on policy head + cross_entropy = \ + tf.nn.softmax_cross_entropy_with_logits(labels=y_, + logits=y_conv) + policy_loss = tf.reduce_mean(cross_entropy) + + # Loss on value head + mse_loss = \ + tf.reduce_mean(tf.squared_difference(z_, z_conv)) + + # Regularizer + reg_variables = tf.get_collection(tf.GraphKeys.WEIGHTS) + reg_term = self.l2_scale * tf.add_n( + [tf.cast(tf.nn.l2_loss(v), tf.float32) for v in reg_variables]) + + # For training from a (smaller) dataset of strong players, you will + # want to reduce the factor in front of self.mse_loss here. + loss = 1.0 * policy_loss + 1.0 * mse_loss + reg_term + + return loss, policy_loss, mse_loss, reg_term, y_conv + def assign(self, var, values): try: self.session.run(tf.assign(var, values)) @@ -432,30 +521,47 @@ def get_batchnorm_key(self): self.batch_norm_count += 1 return result + def reset_batchnorm_key(self): + self.batch_norm_count = 0 + self.reuse_var = True + + def add_weights(self, var): + if self.reuse_var is None: + if var.name[-11:] == "fp16_cast:0": + name = var.name[:-12] + ":0" + var = tf.get_default_graph().get_tensor_by_name(name) + # All trainable variables should be stored as fp32 + assert var.dtype.base_dtype == tf.float32 + self.weights.append(var) + def batch_norm(self, net): # The weights are internal to the batchnorm layer, so apply # a unique scope that we can store, and use to look them back up # later on. 
scope = self.get_batchnorm_key() - with tf.variable_scope(scope): + with tf.variable_scope(scope, + custom_getter=float32_variable_storage_getter): net = tf.layers.batch_normalization( net, epsilon=1e-5, axis=1, fused=True, center=True, scale=False, - training=self.training) + training=self.training, + reuse=self.reuse_var) for v in ['beta', 'moving_mean', 'moving_variance' ]: - name = scope + '/batch_normalization/' + v + ':0' + name = "fp32_storage/" + scope + '/batch_normalization/' + v + ':0' var = tf.get_default_graph().get_tensor_by_name(name) - self.weights.append(var) + self.add_weights(var) return net + def conv_block(self, inputs, filter_size, input_channels, output_channels, name): + W_conv = weight_variable( + name, + [filter_size, filter_size, input_channels, output_channels], + self.model_dtype) - def conv_block(self, inputs, filter_size, input_channels, output_channels): - W_conv = weight_variable([filter_size, filter_size, - input_channels, output_channels]) - self.weights.append(W_conv) + self.add_weights(W_conv) net = inputs net = conv2d(net, W_conv) @@ -463,21 +569,23 @@ def conv_block(self, inputs, filter_size, input_channels, output_channels): net = tf.nn.relu(net) return net - def residual_block(self, inputs, channels): + def residual_block(self, inputs, channels, name): net = inputs orig = tf.identity(net) # First convnet weights - W_conv_1 = weight_variable([3, 3, channels, channels]) - self.weights.append(W_conv_1) + W_conv_1 = weight_variable(name + "_conv_1", [3, 3, channels, channels], + self.model_dtype) + self.add_weights(W_conv_1) net = conv2d(net, W_conv_1) net = self.batch_norm(net) net = tf.nn.relu(net) # Second convnet weights - W_conv_2 = weight_variable([3, 3, channels, channels]) - self.weights.append(W_conv_2) + W_conv_2 = weight_variable(name + "_conv_2", [3, 3, channels, channels], + self.model_dtype) + self.add_weights(W_conv_2) net = conv2d(net, W_conv_2) net = self.batch_norm(net) @@ -494,36 +602,41 @@ def 
construct_net(self, planes): # Input convolution flow = self.conv_block(x_planes, filter_size=3, input_channels=18, - output_channels=self.RESIDUAL_FILTERS) + output_channels=self.residual_filters, + name="first_conv") # Residual tower - for _ in range(0, self.RESIDUAL_BLOCKS): - flow = self.residual_block(flow, self.RESIDUAL_FILTERS) + for i in range(0, self.residual_blocks): + block_name = "res_" + str(i) + flow = self.residual_block(flow, self.residual_filters, + name=block_name) # Policy head conv_pol = self.conv_block(flow, filter_size=1, - input_channels=self.RESIDUAL_FILTERS, - output_channels=2) - h_conv_pol_flat = tf.reshape(conv_pol, [-1, 2*19*19]) - W_fc1 = weight_variable([2 * 19 * 19, (19 * 19) + 1]) - b_fc1 = bias_variable([(19 * 19) + 1]) - self.weights.append(W_fc1) - self.weights.append(b_fc1) + input_channels=self.residual_filters, + output_channels=2, + name="policy_head") + h_conv_pol_flat = tf.reshape(conv_pol, [-1, 2 * 19 * 19]) + W_fc1 = weight_variable("w_fc_1", [2 * 19 * 19, (19 * 19) + 1], self.model_dtype) + b_fc1 = bias_variable("b_fc_1", [(19 * 19) + 1], self.model_dtype) + self.add_weights(W_fc1) + self.add_weights(b_fc1) h_fc1 = tf.add(tf.matmul(h_conv_pol_flat, W_fc1), b_fc1) # Value head conv_val = self.conv_block(flow, filter_size=1, - input_channels=self.RESIDUAL_FILTERS, - output_channels=1) - h_conv_val_flat = tf.reshape(conv_val, [-1, 19*19]) - W_fc2 = weight_variable([19 * 19, 256]) - b_fc2 = bias_variable([256]) - self.weights.append(W_fc2) - self.weights.append(b_fc2) + input_channels=self.residual_filters, + output_channels=1, + name="value_head") + h_conv_val_flat = tf.reshape(conv_val, [-1, 19 * 19]) + W_fc2 = weight_variable("w_fc_2", [19 * 19, 256], self.model_dtype) + b_fc2 = bias_variable("b_fc_2", [256], self.model_dtype) + self.add_weights(W_fc2) + self.add_weights(b_fc2) h_fc2 = tf.nn.relu(tf.add(tf.matmul(h_conv_val_flat, W_fc2), b_fc2)) - W_fc3 = weight_variable([256, 1]) - b_fc3 = bias_variable([1]) - 
self.weights.append(W_fc3) - self.weights.append(b_fc3) + W_fc3 = weight_variable("w_fc_3", [256, 1], self.model_dtype) + b_fc3 = bias_variable("b_fc_3", [1], self.model_dtype) + self.add_weights(W_fc3) + self.add_weights(b_fc3) h_fc3 = tf.nn.tanh(tf.add(tf.matmul(h_fc2, W_fc3), b_fc3)) return h_fc1, h_fc3 @@ -594,21 +707,21 @@ def gen_block(size, f_in, f_out): class TFProcessTest(unittest.TestCase): def test_can_replace_weights(self): - tfprocess = TFProcess() + tfprocess = TFProcess(6, 128) tfprocess.init(batch_size=1) # use known data to test replace_weights() works. - data = gen_block(3, 18, tfprocess.RESIDUAL_FILTERS) # input conv - for _ in range(tfprocess.RESIDUAL_BLOCKS): + data = gen_block(3, 18, tfprocess.residual_filters) # input conv + for _ in range(tfprocess.residual_blocks): data.extend(gen_block(3, - tfprocess.RESIDUAL_FILTERS, tfprocess.RESIDUAL_FILTERS)) + tfprocess.residual_filters, tfprocess.residual_filters)) data.extend(gen_block(3, - tfprocess.RESIDUAL_FILTERS, tfprocess.RESIDUAL_FILTERS)) + tfprocess.residual_filters, tfprocess.residual_filters)) # policy - data.extend(gen_block(1, tfprocess.RESIDUAL_FILTERS, 2)) + data.extend(gen_block(1, tfprocess.residual_filters, 2)) data.append([0.4] * 2*19*19 * (19*19+1)) data.append([0.5] * (19*19+1)) # value - data.extend(gen_block(1, tfprocess.RESIDUAL_FILTERS, 1)) + data.extend(gen_block(1, tfprocess.residual_filters, 1)) data.append([0.6] * 19*19 * 256) data.append([0.7] * 256) data.append([0.8] * 256) diff --git a/validation/.gitignore b/validation/.gitignore index 389b9d5eb..4ab5fe227 100644 --- a/validation/.gitignore +++ b/validation/.gitignore @@ -1,3 +1,5 @@ +validation.pro.user + /Makefile /.qmake.stash /validation diff --git a/validation/CMakeLists.txt b/validation/CMakeLists.txt index 206ac02fd..42db8d7db 100644 --- a/validation/CMakeLists.txt +++ b/validation/CMakeLists.txt @@ -7,4 +7,4 @@ add_executable(validation set_target_properties(validation PROPERTIES AUTOMOC 1) 
target_link_libraries(validation Qt5::Core) -install(TARGETS validation DESTINATION bin) +install(TARGETS validation DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/validation/Results.cpp b/validation/Results.cpp index ab4a66691..5050942e6 100644 --- a/validation/Results.cpp +++ b/validation/Results.cpp @@ -16,30 +16,34 @@ along with Leela Zero. If not, see . */ +#include +#include + #include "Results.h" + #include "../autogtp/Game.h" #include "SPRT.h" -#include -#include void Results::addGameResult(Sprt::GameResult result, int side) { m_gamesPlayed++; if (result == Sprt::GameResult::Win) { - if (side == Game::BLACK) + if (side == Game::BLACK) { m_blackWins++; - else + } else { m_whiteWins++; + } } else { - if (side == Game::BLACK) + if (side == Game::BLACK) { m_blackLosses++; - else + } else { m_whiteLosses++; + } } } std::string winPercentColumn(int wins, int games) { - auto line = QString::asprintf(" %4d %5.2f%%", wins, - 100.0f * (wins / (float)games)); + auto line = + QString::asprintf(" %4d %5.2f%%", wins, 100.0f * (wins / (float)games)); return line.toStdString(); } @@ -53,10 +57,8 @@ void Results::printResults(const QString& firstNetName, DEFG5678 111 63.07% 61 68.54% 50 57.47% 98 55.68% 78 44.32% */ - auto first_name = firstNetName.leftJustified(8, ' ', true)\ - .toStdString(); - auto second_name = secondNetName.leftJustified(8, ' ', true)\ - .toStdString(); + auto first_name = firstNetName.leftJustified(8, ' ', true).toStdString(); + auto second_name = secondNetName.leftJustified(8, ' ', true).toStdString(); // Results for player one auto p1_wins = m_blackWins + m_whiteWins; @@ -70,24 +72,20 @@ void Results::printResults(const QString& firstNetName, auto title_line = QString::asprintf("%13s %-11s %-11s %s\n", "", "wins", "black", "white"); - std::cout - << first_name << " v " << second_name - << " ( " << m_gamesPlayed << " games)" << std::endl; + std::cout << first_name << " v " << second_name + << " ( " << m_gamesPlayed << " games)" << std::endl; 
std::cout << title_line.toStdString(); - std::cout - << first_name - << winPercentColumn(p1_wins, m_gamesPlayed) - << winPercentColumn(m_blackWins, black_wins) - << winPercentColumn(m_whiteWins, white_wins) << std::endl; - std::cout - << second_name - << winPercentColumn(p1_losses, m_gamesPlayed) - << winPercentColumn(m_whiteLosses, black_wins) - << winPercentColumn(m_blackLosses, white_wins) << std::endl; - std::cout - << std::string(20, ' ') - << winPercentColumn(black_wins, m_gamesPlayed) - << winPercentColumn(white_wins, m_gamesPlayed) << std::endl; + std::cout << first_name + << winPercentColumn(p1_wins, m_gamesPlayed) + << winPercentColumn(m_blackWins, black_wins) + << winPercentColumn(m_whiteWins, white_wins) << std::endl; + std::cout << second_name + << winPercentColumn(p1_losses, m_gamesPlayed) + << winPercentColumn(m_whiteLosses, black_wins) + << winPercentColumn(m_blackLosses, white_wins) << std::endl; + std::cout << std::string(20, ' ') + << winPercentColumn(black_wins, m_gamesPlayed) + << winPercentColumn(white_wins, m_gamesPlayed) << std::endl; } QTextStream& operator<<(QTextStream& stream, const Results& r) { diff --git a/validation/Results.h b/validation/Results.h index 79e27ca27..c46217daa 100644 --- a/validation/Results.h +++ b/validation/Results.h @@ -19,20 +19,23 @@ #ifndef RESULTS_H #define RESULTS_H -#include "SPRT.h" #include +#include "SPRT.h" class Results { public: Results() = default; - int getGamesPlayed() const { return m_gamesPlayed; } + int getGamesPlayed() const { + return m_gamesPlayed; + } void addGameResult(Sprt::GameResult result, int side); void printResults(const QString& firstNetName, const QString& secondNetName) const; friend QTextStream& operator<<(QTextStream& stream, const Results& r); friend QTextStream& operator>>(QTextStream& stream, Results& r); + private: int m_gamesPlayed{0}; int m_blackWins{0}; diff --git a/validation/SPRT.cpp b/validation/SPRT.cpp index 923acf55f..b95e2579b 100644 --- a/validation/SPRT.cpp +++ 
b/validation/SPRT.cpp @@ -18,16 +18,16 @@ along with Leela Zero. If not, see . */ -#include "SPRT.h" +#include #include #include -#include + +#include "SPRT.h" class BayesElo; class SprtProbability; -class BayesElo -{ +class BayesElo { public: BayesElo(double bayesElo, double drawElo); BayesElo(const SprtProbability& p); @@ -41,8 +41,7 @@ class BayesElo double m_drawElo; }; -class SprtProbability -{ +class SprtProbability { public: SprtProbability(int wins, int losses, int draws); SprtProbability(const BayesElo& b); @@ -58,41 +57,34 @@ class SprtProbability double m_pDraw; }; - BayesElo::BayesElo(double bayesElo, double drawElo) - : m_bayesElo(bayesElo), - m_drawElo(drawElo) -{ -} + : m_bayesElo(bayesElo), m_drawElo(drawElo) {} -BayesElo::BayesElo(const SprtProbability& p) -{ +BayesElo::BayesElo(const SprtProbability& p) { Q_ASSERT(p.isValid()); - m_bayesElo = 200.0 * std::log10(p.pWin() / p.pLoss() * - (1.0 - p.pLoss()) / (1.0 - p.pWin())); - m_drawElo = 200.0 * std::log10((1.0 - p.pLoss()) / p.pLoss() * - (1.0 - p.pWin()) / p.pWin()); + m_bayesElo = 200.0 + * std::log10(p.pWin() / p.pLoss() * (1.0 - p.pLoss()) + / (1.0 - p.pWin())); + m_drawElo = 200.0 + * std::log10((1.0 - p.pLoss()) / p.pLoss() * (1.0 - p.pWin()) + / p.pWin()); } -double BayesElo::bayesElo() const -{ +double BayesElo::bayesElo() const { return m_bayesElo; } -double BayesElo::drawElo() const -{ +double BayesElo::drawElo() const { return m_drawElo; } -double BayesElo::scale() const -{ +double BayesElo::scale() const { const double x = std::pow(10.0, -m_drawElo / 400.0); return 4.0 * x / ((1.0 + x) * (1.0 + x)); } -SprtProbability::SprtProbability(int wins, int losses, int draws) -{ +SprtProbability::SprtProbability(int wins, int losses, int draws) { Q_ASSERT(wins > 0 && losses > 0 && draws > 0); const int count = wins + losses + draws; @@ -102,71 +94,56 @@ SprtProbability::SprtProbability(int wins, int losses, int draws) m_pDraw = 1.0 - m_pWin - m_pLoss; } -SprtProbability::SprtProbability(const 
BayesElo& b) -{ - m_pWin = 1.0 / (1.0 + std::pow(10.0, (b.drawElo() - b.bayesElo()) / 400.0)); - m_pLoss = 1.0 / (1.0 + std::pow(10.0, (b.drawElo() + b.bayesElo()) / 400.0)); +SprtProbability::SprtProbability(const BayesElo& b) { + m_pWin = + 1.0 / (1.0 + std::pow(10.0, (b.drawElo() - b.bayesElo()) / 400.0)); + m_pLoss = + 1.0 / (1.0 + std::pow(10.0, (b.drawElo() + b.bayesElo()) / 400.0)); m_pDraw = 1.0 - m_pWin - m_pLoss; } -bool SprtProbability::isValid() const -{ - return 0.0 < m_pWin && m_pWin < 1.0 && - 0.0 < m_pLoss && m_pLoss < 1.0 && - 0.0 < m_pDraw && m_pDraw < 1.0; +bool SprtProbability::isValid() const { + return 0.0 < m_pWin && m_pWin < 1.0 + && 0.0 < m_pLoss && m_pLoss < 1.0 + && 0.0 < m_pDraw && m_pDraw < 1.0; } -double SprtProbability::pWin() const -{ +double SprtProbability::pWin() const { return m_pWin; } -double SprtProbability::pLoss() const -{ +double SprtProbability::pLoss() const { return m_pLoss; } -double SprtProbability::pDraw() const -{ +double SprtProbability::pDraw() const { return m_pDraw; } +Sprt::Sprt() + : m_elo0(0), + m_elo1(0), + m_alpha(0), + m_beta(0), + m_wins(0), + m_losses(0), + m_draws(0), + m_mutex() {} -Sprt::Sprt(): - m_elo0(0), - m_elo1(0), - m_alpha(0), - m_beta(0), - m_wins(0), - m_losses(0), - m_draws(0), - m_mutex() -{ -} - -bool Sprt::isNull() const -{ +bool Sprt::isNull() const { return m_elo0 == 0 && m_elo1 == 0 && m_alpha == 0 && m_beta == 0; } -void Sprt::initialize(double elo0, double elo1, - double alpha, double beta) -{ +void Sprt::initialize(double elo0, double elo1, double alpha, double beta) { m_elo0 = elo0; m_elo1 = elo1; m_alpha = alpha; m_beta = beta; } -Sprt::Status Sprt::status() const -{ +Sprt::Status Sprt::status() const { QMutexLocker locker(&m_mutex); - Status status = { - Continue, - 0.0, - 0.0, - 0.0 - }; + Status status = {Continue, 0.0, 0.0, 0.0}; status.lBound = std::log(m_beta / (1.0 - m_alpha)); status.uBound = std::log((1.0 - m_beta) / m_alpha); @@ -190,33 +167,32 @@ Sprt::Status 
Sprt::status() const const SprtProbability p0(b0), p1(b1); // Log-Likelyhood Ratio - status.llr = m_wins * std::log(p1.pWin() / p0.pWin()) + - m_losses * std::log(p1.pLoss() / p0.pLoss()) + - m_draws * std::log(p1.pDraw() / p0.pDraw()); + status.llr = m_wins * std::log(p1.pWin() / p0.pWin()) + + m_losses * std::log(p1.pLoss() / p0.pLoss()) + + m_draws * std::log(p1.pDraw() / p0.pDraw()); // Bounds based on error levels of the test - - if (status.llr > status.uBound) + if (status.llr > status.uBound) { status.result = AcceptH1; - else if (status.llr < status.lBound) + } else if (status.llr < status.lBound) { status.result = AcceptH0; + } return status; } -void Sprt::addGameResult(GameResult result) -{ +void Sprt::addGameResult(GameResult result) { QMutexLocker locker(&m_mutex); - if (result == Win) + if (result == Win) { m_wins++; - else if (result == Draw) + } else if (result == Draw) { m_draws++; - else if (result == Loss) + } else if (result == Loss) { m_losses++; + } } -std::tuple Sprt::getWDL() const -{ +std::tuple Sprt::getWDL() const { return std::make_tuple(m_wins, m_draws, m_losses); } diff --git a/validation/SPRT.h b/validation/SPRT.h index 063d715c3..6ca530ab1 100644 --- a/validation/SPRT.h +++ b/validation/SPRT.h @@ -35,34 +35,30 @@ #include #include -class Sprt -{ +class Sprt { public: - /*! The result of the test. */ - enum Result - { - Continue, //!< Continue monitoring - AcceptH0, //!< Accept null hypothesis H0 - AcceptH1 //!< Accept alternative hypothesis H1 + /*! The result of the test. */ + enum Result { + Continue, //!< Continue monitoring + AcceptH0, //!< Accept null hypothesis H0 + AcceptH1 //!< Accept alternative hypothesis H1 }; /*! The result of a chess game. 
*/ - enum GameResult - { - NoResult = 0, //!< Game ended with no result - Win, //!< First player won - Loss, //!< First player lost - Draw, //!< Game was drawn - NotEnded //!< Game was interrupted + enum GameResult { + NoResult = 0, //!< Game ended with no result + Win, //!< First player won + Loss, //!< First player lost + Draw, //!< Game was drawn + NotEnded //!< Game was interrupted }; /*! The status of the test. */ - struct Status - { - Result result; //!< Test result + struct Status { + Result result; //!< Test result double llr; //!< Log-likelihood ratio - double lBound; //!< Lower bound - double uBound; //!< Upper bound + double lBound; //!< Lower bound + double uBound; //!< Upper bound }; /*! Creates a new uninitialized Sprt object. */ @@ -83,8 +79,7 @@ class Sprt * \a alpha is the maximum probability for a type I error and * \a beta for a type II error outside interval [elo0, elo1]. */ - void initialize(double elo0, double elo1, - double alpha, double beta); + void initialize(double elo0, double elo1, double alpha, double beta); /*! Returns the current status of the test. */ Status status() const; @@ -101,6 +96,7 @@ class Sprt void addGameResult(GameResult result); friend QTextStream& operator<<(QTextStream& stream, const Sprt& sprt); friend QTextStream& operator>>(QTextStream& stream, Sprt& sprt); + private: double m_elo0; double m_elo1; diff --git a/validation/Validation.cpp b/validation/Validation.cpp index cbb6f9c24..85ea0ae71 100644 --- a/validation/Validation.cpp +++ b/validation/Validation.cpp @@ -16,32 +16,32 @@ along with Leela Zero. If not, see . 
*/ -#include "Validation.h" -#include #include +#include #include +#include "Validation.h" + using VersionTuple = std::tuple; // Minimal Leela Zero version we expect to see -const VersionTuple min_leelaz_version{0, 10, 0}; - +const VersionTuple min_leelaz_version{0, 16, 0}; void ValidationWorker::run() { do { - Game first(m_firstNet, m_firstOpts, m_firstBin); + Game first(m_engines[0]); if (!first.gameStart(min_leelaz_version)) { emit resultReady(Sprt::NoResult, Game::BLACK); return; } - Game second(m_secondNet, m_secondOpts, m_secondBin); + Game second(m_engines[1]); if (!second.gameStart(min_leelaz_version)) { emit resultReady(Sprt::NoResult, Game::BLACK); return; } - QTextStream(stdout) << "starting:" << endl << - first.getCmdLine() << endl << - "vs" << endl << - second.getCmdLine() << endl; + QTextStream(stdout) << "starting:" << endl + << m_engines[0].getCmdLine() << endl + << "vs" << endl + << m_engines[1].getCmdLine() << endl; QString wmove = "play white "; QString bmove = "play black "; @@ -79,7 +79,8 @@ void ValidationWorker::run() { } else { prefix.append("white_"); } - QFile(first.getFile() + ".sgf").rename(prefix + first.getFile() + ".sgf"); + QFile(first.getFile() + ".sgf") + .rename(prefix + first.getFile() + ".sgf"); } } QTextStream(stdout) << "Stopping engine." 
<< endl; @@ -93,9 +94,7 @@ void ValidationWorker::run() { emit resultReady(Sprt::Loss, m_expected); } // Change color and play again - m_firstNet.swap(m_secondNet); - m_firstBin.swap(m_secondBin); - m_firstOpts.swap(m_secondOpts); + std::swap(m_engines[0], m_engines[1]); if (m_expected == Game::BLACK) { m_expected = Game::WHITE; } else { @@ -109,96 +108,58 @@ void ValidationWorker::run() { } void ValidationWorker::init(const QString& gpuIndex, - const QString& firstNet, - const QString& secondNet, - const QString& firstBin, - const QString& secondBin, - const QString& firstOpts, - const QString& secondOpts, - const QString& keep, + const QVector& engines, const QString& keep, int expected) { - m_firstOpts = firstOpts; - m_secondOpts = secondOpts; + m_engines = engines; if (!gpuIndex.isEmpty()) { - m_firstOpts.prepend(" --gpu=" + gpuIndex + " "); - m_secondOpts.prepend(" --gpu=" + gpuIndex + " "); + m_engines[0].m_options.prepend(" --gpu=" + gpuIndex + " "); + m_engines[1].m_options.prepend(" --gpu=" + gpuIndex + " "); } - m_firstNet = firstNet; - m_secondNet = secondNet; - m_firstBin = firstBin; - m_secondBin = secondBin; m_expected = expected; m_keepPath = keep; m_state.store(RUNNING); } -Validation::Validation(const int gpus, - const int games, - const QStringList& gpuslist, - const QString& firstNet, - const QString& secondNet, - const QString& keep, - QMutex* mutex, - const QString& firstBin, - const QString& secondBin, - const QString& firstOpts, - const QString& secondOpts, - const float& h0, - const float& h1) : +Validation::Validation(const int gpus, const int games, + const QStringList& gpuslist, QVector& engines, + const QString& keep, QMutex* mutex, const float h0, + const float h1) + : - m_mainMutex(mutex), - m_syncMutex(), - m_gamesThreads(gpus*games), - m_games(games), - m_gpus(gpus), - m_gpusList(gpuslist), - m_firstNet(firstNet), - m_secondNet(secondNet), - m_firstBin(firstBin), - m_secondBin(secondBin), - m_firstOpts(firstOpts), - 
m_secondOpts(secondOpts), - m_keepPath(keep) { + m_mainMutex(mutex), + m_syncMutex(), + m_gamesThreads(gpus * games), + m_games(games), + m_gpus(gpus), + m_gpusList(gpuslist), + m_engines(engines), + m_keepPath(keep) { m_statistic.initialize(h0, h1, 0.05, 0.05); m_statistic.addGameResult(Sprt::Draw); } void Validation::startGames() { - QString n1, n2, b1 ,b2 ,o1, o2; - int expected; - QString myGpu; for (int gpu = 0; gpu < m_gpus; ++gpu) { for (int game = 0; game < m_games; ++game) { auto thread_index = gpu * m_games + game; connect(&m_gamesThreads[thread_index], - &ValidationWorker::resultReady, - this, - &Validation::getResult, - Qt::DirectConnection); - if (game % 2) { - n1 = m_firstNet; - n2 = m_secondNet; - b1 = m_firstBin; - b2 = m_secondBin; - o1 = m_firstOpts; - o2 = m_secondOpts; - expected = Game::BLACK; - } else { - n1 = m_secondNet; - n2 = m_firstNet; - b1 = m_secondBin; - b2 = m_firstBin; - o1 = m_secondOpts; - o2 = m_firstOpts; + &ValidationWorker::resultReady, this, + &Validation::getResult, Qt::DirectConnection); + + auto engines = m_engines; + auto expected = Game::BLACK; + if (game & 1) { + std::swap(engines[0], engines[1]); expected = Game::WHITE; } - if (m_gpusList.isEmpty()) { - myGpu = ""; - } else { + + auto myGpu = QString(""); + if (!m_gpusList.isEmpty()) { myGpu = m_gpusList.at(gpu); } - m_gamesThreads[thread_index].init(myGpu, n1, n2, b1, b2, o1, o2, m_keepPath, expected); + m_gamesThreads[thread_index].init(myGpu, engines, m_keepPath, + expected); m_gamesThreads[thread_index].start(); } } @@ -213,7 +174,7 @@ void Validation::saveSprt() { out << m_statistic; out << m_results; f.close(); - m_results.printResults(m_firstNet, m_secondNet); + m_results.printResults(m_engines[0].m_network, m_engines[1].m_network); printSprtStatus(m_statistic.status()); } @@ -237,18 +198,17 @@ void Validation::loadSprt() { f.close(); QFile::remove(fi.fileName()); QTextStream(stdout) << "Initial Statistics" << endl; - m_results.printResults(m_firstNet, 
m_secondNet); + m_results.printResults(m_engines[0].m_network, m_engines[1].m_network); printSprtStatus(m_statistic.status()); } void Validation::printSprtStatus(const Sprt::Status& status) { - QTextStream(stdout) - << m_results.getGamesPlayed() << " games played." << endl; - QTextStream(stdout) - << "Status: " << status.result - << " LLR " << status.llr - << " Lower Bound " << status.lBound - << " Upper Bound " << status.uBound << endl; + QTextStream(stdout) << m_results.getGamesPlayed() << " games played." + << endl; + QTextStream(stdout) << "Status: " << status.result + << " LLR " << status.llr + << " Lower Bound " << status.lBound + << " Upper Bound " << status.uBound << endl; } void Validation::getResult(Sprt::GameResult result, int net_one_color) { @@ -268,10 +228,10 @@ void Validation::getResult(Sprt::GameResult result, int net_one_color) { quitThreads(); QTextStream(stdout) << "The first net is " - << ((status.result == Sprt::AcceptH0) ? "worse " : "better ") + << ((status.result == Sprt::AcceptH0) ? "worse " : "better ") << "than the second" << endl; - m_results.printResults(m_firstNet, m_secondNet); - //sendQuit(); + m_results.printResults(m_engines[0].m_network, m_engines[1].m_network); + // sendQuit(); } else { printSprtStatus(status); } diff --git a/validation/Validation.h b/validation/Validation.h index e17830a50..2d62207b3 100644 --- a/validation/Validation.h +++ b/validation/Validation.h @@ -18,15 +18,16 @@ along with Leela Zero. If not, see . 
*/ -#include +#include +#include #include +#include #include #include -#include -#include -#include "SPRT.h" + #include "../autogtp/Game.h" #include "Results.h" +#include "SPRT.h" class ValidationWorker : public QThread { Q_OBJECT @@ -38,29 +39,20 @@ class ValidationWorker : public QThread { ValidationWorker() = default; ValidationWorker(const ValidationWorker& w) : QThread(w.parent()) {} ~ValidationWorker() = default; - void init(const QString& gpuIndex, - const QString& firstNet, - const QString& secondNet, - const QString& firstBin, - const QString& secondBin, - const QString& firstOpts, - const QString& secondOpts, - const QString& keep, - int expected); + void init(const QString& gpuIndex, const QVector& engines, + const QString& keep, int expected); void run() override; - void doFinish() { m_state.store(FINISHING); } + void doFinish() { + m_state.store(FINISHING); + } signals: void resultReady(Sprt::GameResult r, int net_one_color); + private: - QString m_firstNet; - QString m_secondNet; + QVector m_engines; int m_expected; QString m_keepPath; - QString m_firstBin; - QString m_secondBin; - QString m_firstOpts; - QString m_secondOpts; QAtomicInt m_state; }; @@ -68,18 +60,9 @@ class Validation : public QObject { Q_OBJECT public: - Validation(const int gpus, const int games, - const QStringList& gpusList, - const QString& firstNet, - const QString& secondNet, - const QString& keep, - QMutex* mutex, - const QString& firstBin, - const QString& secondBin, - const QString& firstOpts, - const QString& secondOpts, - const float& h0, - const float& h1); + Validation(int gpus, int games, const QStringList& gpusList, + QVector& engines, const QString& keep, QMutex* mutex, + float h0, float h1); ~Validation() = default; void startGames(); void wait(); @@ -89,6 +72,7 @@ class Validation : public QObject { public slots: void getResult(Sprt::GameResult result, int net_one_color); void storeSprt(); + private: QMutex* m_mainMutex; QMutex m_syncMutex; @@ -98,12 +82,7 @@ public 
slots: int m_games; int m_gpus; QStringList m_gpusList; - QString m_firstNet; - QString m_secondNet; - QString m_firstBin; - QString m_secondBin; - QString m_firstOpts; - QString m_secondOpts; + QVector& m_engines; QString m_keepPath; void quitThreads(); void saveSprt(); diff --git a/validation/main.cpp b/validation/main.cpp index 8bcc1a010..8d68669db 100644 --- a/validation/main.cpp +++ b/validation/main.cpp @@ -17,24 +17,24 @@ along with Leela Zero. If not, see . */ -#include -#include -#include -#include #include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include "../autogtp/Game.h" + #include "../autogtp/Console.h" +#include "../autogtp/Game.h" #include "Validation.h" constexpr int VALIDATION_VERSION = 1; -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { QCoreApplication app(argc, argv); app.setApplicationName("validation"); app.setApplicationVersion(QString("v%1").arg(VALIDATION_VERSION)); @@ -44,40 +44,46 @@ int main(int argc, char *argv[]) { QCommandLineOption networkOption( {"n", "network"}, - "Networks to use as players in competition mode (two are needed).", - "filename"); - QCommandLineOption binaryOption( - {"b", "binary"}, - "Binary to execute for the game (default ./leelaz).", - "filename"); + "Networks to use as players in competition mode (two are needed).", + "filename"); QCommandLineOption optionsOption( {"o", "options"}, - "Options for the binary given by -b (default '-g -p 1600 --noponder -t 1 -q -d -r 0 -w').", - "opt_string"); + "Options for the binary given by -b (default \"-g -v 3200 --noponder " + "-t 1 -q -d -r 0 -w\").", + "opt_string"); + QCommandLineOption gtpCommandOption( + {"c", "gtp-command"}, + "GTP command to send to the binary on startup (default \"time_settings " + "0 1 0\").\n" + "Multiple commands are sent in the order they are specified.\n" + "Commands apply to the preceeding binary or both if specified before " + "all 
binaries.", + "command"); QCommandLineOption sprtOption( - {"s", "sprt"}, - "Set the SPRT hypothesis (default '0.0:35.0').", - "lower:upper", "0.0:35.0"); + {"s", "sprt"}, "Set the SPRT hypothesis (default '0.0:35.0').", + "lower:upper", "0.0:35.0"); QCommandLineOption gamesNumOption( {"g", "gamesNum"}, - "Play 'gamesNum' games on one GPU at the same time.", - "num", "1"); + "Play 'gamesNum' games on one device (GPU/CPU) at the same time.", + "num", "1"); QCommandLineOption gpusOption( {"u", "gpus"}, - "Index of the GPU to use for multiple GPUs support.", - "num"); + "Index of the device(s) to use for multiple devices support.", "num"); QCommandLineOption keepSgfOption( - {"k", "keepSgf" }, - "Save SGF files after each self-play game.", - "output directory"); + {"k", "keepSgf"}, "Save SGF files after each self-play game.", + "output directory"); parser.addOption(gamesNumOption); parser.addOption(gpusOption); - parser.addOption(networkOption); - parser.addOption(binaryOption); - parser.addOption(optionsOption); parser.addOption(sprtOption); parser.addOption(keepSgfOption); + parser.addOption(networkOption); + parser.addOption(optionsOption); + parser.addOption(gtpCommandOption); + parser.addPositionalArgument( + "[-- binary [--gtp-command...] 
[-- binary [--gtp-command...]]]", + "Binary to execute for the game (default ./leelaz).\n" + "Only --gtp-command options are parsed after a binary is specified"); // Process the actual command line arguments given by the user parser.process(app); @@ -86,14 +92,9 @@ int main(int argc, char *argv[]) { parser.showHelp(); } - QStringList binList = parser.values(binaryOption); - while (binList.count() != 2) { - binList << "./leelaz"; - } - QStringList optsList = parser.values(optionsOption); - while (optsList.count() != 2) { - optsList << " -g -p 1600 --noponder -t 1 -q -d -r 0 -w "; + while (optsList.count() < 2) { + optsList << " -g -v 3200 --noponder -t 1 -q -d -r 0 -w "; } QString sprtOpt = parser.value(sprtOption); @@ -109,27 +110,50 @@ int main(int argc, char *argv[]) { } QTextStream(stdout) << "validation v" << VALIDATION_VERSION << endl; + + auto const keepPath = parser.value(keepSgfOption); if (parser.isSet(keepSgfOption)) { if (!QDir().mkpath(parser.value(keepSgfOption))) { - QTextStream(stdout) << "Couldn't create output directory for self-play SGF files!" - << endl; + QTextStream(stdout) + << "Couldn't create output directory for self-play SGF files!" 
+ << endl; return EXIT_FAILURE; } } + + QStringList commandList = {"time_settings 0 1 0"}; + commandList << parser.values(gtpCommandOption); + + auto engines = + QVector({Engine(netList[0], optsList[0], commandList), + Engine(netList[1], optsList[1], commandList)}); + + auto engine_idx = 0; + auto pos_args = parser.positionalArguments(); + while (!pos_args.isEmpty()) { + if (engine_idx >= 2) { + parser.showHelp(); + } + engines[engine_idx].m_binary = pos_args[0]; + parser.process(pos_args); + engines[engine_idx].m_commands << parser.values(gtpCommandOption); + pos_args = parser.positionalArguments(); + engine_idx++; + } + QMutex mutex; - QTextStream(stdout) << "SPRT : " << sprtOpt << " h0 " << h0 << " h1 " << h1 << endl; - - Console *cons = nullptr; - Validation *validate = new Validation(gpusNum, gamesNum, gpusList, - netList.at(0), netList.at(1), - parser.value(keepSgfOption), &mutex, - binList.at(0), binList.at(1), - optsList.at(0), optsList.at(1), - h0, h1); - QObject::connect(&app, &QCoreApplication::aboutToQuit, validate, &Validation::storeSprt); + QTextStream(stdout) << "SPRT : " << sprtOpt << " h0 " << h0 << " h1 " << h1 + << endl; + + Console* cons = nullptr; + Validation* validate = new Validation(gpusNum, gamesNum, gpusList, engines, + keepPath, &mutex, h0, h1); + QObject::connect(&app, &QCoreApplication::aboutToQuit, validate, + &Validation::storeSprt); validate->loadSprt(); validate->startGames(); - QObject::connect(validate, &Validation::sendQuit, &app, &QCoreApplication::quit); + QObject::connect(validate, &Validation::sendQuit, &app, + &QCoreApplication::quit); cons = new Console(); QObject::connect(cons, &Console::sendQuit, &app, &QCoreApplication::quit); return app.exec();