From 11c53e338e6c601a9b1dc262cd7ce7f02971844f Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Tue, 15 Mar 2022 16:28:03 -0700 Subject: [PATCH] update docs --- dockers/rtg-0.7-py39_tr110_cu114.dockerfile | 42 + docs/howto-release.adoc | 21 +- docs/index.html | 2 +- docs/v0.7/index.html | 3154 +++++++++++++++++++ docs/versions.adoc | 1 + docs/versions.html | 5 +- rtg/serve/static/docs.html | 35 +- 7 files changed, 3249 insertions(+), 11 deletions(-) create mode 100644 dockers/rtg-0.7-py39_tr110_cu114.dockerfile create mode 100644 docs/v0.7/index.html diff --git a/dockers/rtg-0.7-py39_tr110_cu114.dockerfile b/dockers/rtg-0.7-py39_tr110_cu114.dockerfile new file mode 100644 index 0000000..6ff2bad --- /dev/null +++ b/dockers/rtg-0.7-py39_tr110_cu114.dockerfile @@ -0,0 +1,42 @@ +# What is this: USC ISI Coral team's MT pipeline +# Authors: +# - Thamme Gowda +# Created : Oct 20, 2020 + +#FROM nvidia/cuda:10.2-devel-ubuntu18.04 +#FROM nvidia/cuda:11.1-devel-ubuntu20.04 +FROM nvidia/cuda:11.4.0-runtime-ubuntu20.04 + +# suppress prompts https://stackoverflow.com/a/67452950/1506477 +ARG DEBIAN_FRONTEND=noninteractive +RUN apt update \ + && apt install -y curl python3.9 python3-pip python3.9-dev \ + build-essential git locales locales-all \ + && apt-get autoremove --purge + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +# Update pip +RUN ln -s /usr/bin/python3.9 /usr/bin/python && python -m pip install --upgrade pip + +#Make non-root user; +RUN useradd --create-home rtguser +#RUN chown -Rv rtguser:rtguser /home/rtguser + +WORKDIR /home/rtguser +USER rtguser + +# pip installed bins go here, they needs to be in PATH +RUN mkdir -p /home/rtguser/.local/bin /home/rtguser/rtg +ENV CUDA_HOME="/usr/local/cuda/" +ENV PATH="/home/rtguser/.local/bin:/usr/local/cuda/bin:${PATH}" + +#COPY --chown=rtguser:rtguser . /home/rtguser/rtg/ +# && cd /home/rtguser/rtg && pip install --editable . \ + +RUN pip install --user torch==1.10 flask==1.1.2 uwsgi rtg==0.7 \ + && pip cache purge + +CMD bash diff --git a/docs/howto-release.adoc b/docs/howto-release.adoc index 3cd5dad..904042c 100644 --- a/docs/howto-release.adoc +++ b/docs/howto-release.adoc @@ -1,4 +1,4 @@ -== Release instructions +== PyPI Release Instructions * PyPI release required twine : https://twine.readthedocs.io/en/latest/ * Docs require asciidoctor: https://anaconda.org/conda-forge/asciidoctor @@ -18,12 +18,12 @@ twine upload -r testpypi dist/* . Make docs and link +.. List a new version for docs: `docs/versions.adoc` + .. Build docs docs/make-docs.sh # docs -.. List a new version for docs: `docs/versions.adoc` - . Upload to **pypi** twine upload -r pypi dist/* @@ -52,3 +52,18 @@ password: ---- For the first time users of PyPI, you need to create an account at https://pypi.org/ AND https://test.pypi.org/. Yes, they are two different accounts! Make your life easy by using same userID and password. + +== Docker Release + + +=== Docker for ARM64 + +With Apple moving to ARM chips, running AMD64 docker images on ARM is not efficient. + +[source,bash] +---- +cd dockers +# find a suitable docker file to build; e.g., this one +docker build . 
-f rtg-0.7-py39_tr110_cu114.dockerfile -t tgowda/rtg:0.7-py39_tr110_cu114 +---- + diff --git a/docs/index.html b/docs/index.html index ea94577..9c585f1 120000 --- a/docs/index.html +++ b/docs/index.html @@ -1 +1 @@ -v0.6.1/index.html \ No newline at end of file +v0.7/index.html \ No newline at end of file diff --git a/docs/v0.7/index.html b/docs/v0.7/index.html new file mode 100644 index 0000000..88cf03b --- /dev/null +++ b/docs/v0.7/index.html @@ -0,0 +1,3154 @@ + + + + + + + +Reader-Translator-Generator (RTG) + + + + + + + + + + + + +
+
+

1. Overview

+
+
+

Reader-Translator-Generator (RTG) is a Neural Machine Translation toolkit based on PyTorch.

+
+
+ +
+
+

1.1. Features

+
+
    +
  • +

    Reproducible experiments: one conf.yml that has everything — data paths, params, and +hyper params — required to reproduce experiments.

    +
  • +
  • +

    Pre-processing options: sentencepiece or nlcodec (or add your own)

    +
    +
      +
    • +

      word/char/bpe etc types

      +
    • +
    • +

      shared vocabulary, separate vocabulary

      +
      +
        +
      • +

        one-way, two-way, three-way tied embeddings

        +
      • +
      +
      +
    • +
    +
    +
  • +
  • +

    Transformer model from "Attention is all you need"

    +
    +
      +
    • +

      Automatically detects and parallelizes across multiple GPUs

      +
      +
        +
      • +

        Lots of transformer varieties: width-varying, skip transformer, etc., configurable from YAML files

        +
      • +
      • +

        RNN based Encoder-Decoder with Attention. (No longer using it, but it’s available for experimentation)

        +
      • +
      +
      +
    • +
    +
    +
  • +
  • +

    Language Modeling: RNN, Transformer

    +
  • +
  • +

    And more …​

    +
    +
      +
    • +

      Easy and interpretable code (for those who read code as much as papers)

      +
    • +
    • +

      Object Oriented Design. (Not too many levels of functions and function factories like Tensor2Tensor)

      +
    • +
    • +

      Experiments and reproducibility are the main focus. To control an experiment, you edit a YAML file inside the experiment directory.

      +
    • +
    • +

      Wherever possible, we prefer convention over configuration. Have a look at the experiment directory structure (below).

      +
    • +
    +
    +
  • +
+
+
+
+

1.2. Google Colab Example

+
+

Use this Google Colab notebook for learning how to train your NMT model with RTG: colab.research.google.com/drive/198KbkUcCGXJXnWiM7IyEiO1Mq2hdVq8T?usp=sharing

+
+
+
+

1.3. Setup

+
+

PyPI version

+
+
+
+
pip install rtg
+
+
+
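To quickly verify the installation, a simple sanity check (this assumes pip's bin directory is on your PATH):
+
rtg-pipe -h
+# should print the pipeline help text (see the RTG CLI section below)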
+

1.3.1. Development Setup:

+
+ + + + + +
+ + +This mode of setup is required only if you are developing (i.e. modifying RTG code). +If you are planning to use RTG without modifying source code, then pip install rtg should be all you need. +
+
+
+

Add the root of this repo to PYTHONPATH, or install it via pip install --editable .

+
+
+

There are two versions of code:

+
+ +
+

Both rtg and rtg-in have the same code on their master branches. +rtg has a stable code base and is meant to be used by anyone, so it is recommended for new users. +rtg-in is internal to ISI NLP, with some unfinished/work-in-progress ideas (maybe unpublished), with issues and pull requests by members of the USC ISI team, and is often less stable. +We sync both code bases often (sync-xt.sh at the root of the repo). +If you would like to collaborate with us and/or get access to rtg-in, email TG or Jonathan May.

+
+
+
+
git clone https://github.com/isi-nlp/rtg.git
+cd rtg                # go to the code
+
+conda create -n rtg python=3.7   # creates a conda env named rtg
+conda activate rtg       # activate it
+
+
+pip install --editable .
+# The requirements are in setup.py; you may customize it if you wish
+
+export PYTHONPATH=$PWD # or add it to PYTHONPATH
+
+
+
+
+

1.3.2. Requirements

+
+ + + + + +
+ + +The required libraries are automatically installed by pip, so manual installation is not required. +We are listing the requirements here for informational purposes only. +
+
+
+ + + + + +
+ + +To view or modify the version numbers of libraries, please go to setup.py at the root of this project. +
+
+
+

The following libraries are used:

+
+ + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 1. Summary of libraries used by RTG

Library

Purpose

torch

deep learning library

tensorboard

logging and visualizing training and validation losses

sacrebleu

BLEU scorer

sacremoses

tokenization and detokenization

tqdm

Progress bar

ruamel.yaml

configuration management

sentencepiece

(optional) vocabulary creation using word, char, BPE

nlcodec

(optional) similar to sentencepiece, but easily customizable; scales to big datasets using pyspark, offers efficient storage of encoded parallel data

flask, jinja

(optional) HTTP API and web interface for serving the models

pyspark

(optional) parallelized data preparation (using nlcodec) for massive datasets.

+
+

Thanks to all the awesome developers of the above tools.

+
+
+
+
+

1.4. Usage

+
+

Refer to scripts/rtg-pipeline.sh bash script and examples/transformer.base.yml file for specific examples.

+
+
+

The pipeline takes source (.src) and target (.tgt) files. The sources are in one language and the targets in another. At a minimum, supply a training source, training target, validation source, and validation target. It is best to use .tok files for training. (.tok means tokenized.)

+
+
+

Example of training and running a model:

+
+
+
+
# if you wish to disable gpu, unset
+# export CUDA_VISIBLE_DEVICES=
+
+python -m rtg.pipeline experiments/sample-exp/
+
+# or use CLI tool installed by pip install
+rtg-pipe experiments/sample-exp/
+
+# or use shell script, edit it to your needs, to submit to Slurm/SGE
+scripts/rtg-pipeline.sh -d experiments/sample-exp/ -c experiments/sample-exp/conf.yml
+
+# Then to use the model to translate something:
+# (VERY poor translation due to small training data)
+echo "Chacun voit midi à sa porte." | python -m rtg.decode experiments/sample-exp/
+
+
+
+

The 001-tfm directory that hosts an experiment looks like this:

+
+
+
+
001-tfm
+├── _PREPARED    <-- Flag file indicating experiment is prepared
+├── _TRAINED     <-- Flag file indicating experiment is trained
+├── conf.yml     <-- Where all the params and hyper params are! You should look into this
+├── data
+│   ├── samples.tsv.gz          <-- samples to log after each check point during training
+│   ├── sentpiece.shared.model  <-- as the name says, sentence piece model, shared
+│   ├── sentpiece.shared.vocab  <-- as the name says
+│   ├── train.db                <-- all the prepared training data in a sqlite db
+│   └── valid.tsv.gz            <-- and the validation data
+├── githead       <-- what was the git HEAD hash when this experiment was started?
+├── job.sh.bak    <-- job script used to submit this to grid. Just in case
+├── models        <-- All checkpoints go inside this
+│   ├── model_400_5.265583_4.977106.pkl
+│   ├── model_800_4.478784_4.606745.pkl
+│   ├── ...
+│   └── scores.tsv <-- train and validation losses. in case you don't want to use tensorboard
+├── rtg.log   <-- the python logs are redirected here
+├── rtg.zip   <-- the source code used to run; `export PYTHONPATH=rtg.zip` to reuse the exact code
+├── scripts -> /Users/tg/work/me/rtg/scripts  <-- link to some perl scripts for detok+BLEU
+├── tensorboard    <-- Tensorboard stuff for visualizations
+│   ├── events.out.tfevents.1552850552.hackb0x2
+│   └── ....
+└── test_step2000_beam4_ens5   <-- Tests after the end of training, BLEU scores
+    ├── valid.ref -> /Users/tg/work/me/rtg/data/valid.ref
+    ├── valid.src -> /Users/tg/work/me/rtg/data/valid.src
+    ├── valid.out.tsv
+    ├── valid.out.tsv.detok.tc.bleu
+    └── valid.out.tsv.detok.lc.bleu
+
+
+
+
+

1.5. Credits / Thanks

+
+ +
+
+
+
+
+

2. RTG conf.yml File

+
+
+

The key component of the RTG toolkit is conf.yml. As the name suggests, it is a YAML file containing the configuration +of an experiment. +Before we try to understand what goes into a configuration file, let us review the high-level entities:

+
+
+
    +
  • +

    Experiment - the top level entity that wraps everything below, for the sake of reproducibility.

    +
  • +
  • +

    Data Preparation - NLP datasets require preparation of textual data; typically, creation of a +vocabulary to map text into sequences of integers. Here we can specify the type of encoding scheme +such as BPE/char/word, and the vocabulary size.

    +
  • +
  • +

    Model - the neural net for NMT or LM tasks. Here we specify the model type (e.g. tfmnmt) and its arguments.

    +
  • +
  • +

    Optimizer - Optimizer and optimization criteria

    +
  • +
  • +

    Trainer - training steps, batch size etc

    +
  • +
  • +

    Tester [Optional] — testing to do post training

    +
    +
      +
    • +

      Tuner [Optional] - to search for beam size, length penalty etc

      +
    • +
    • +

      Decoder - the beam decoder parameters, which may be overwritten by the Tuner

      +
    • +
    • +

      Suite - a set of source and reference file pairs, for computing BLEU scores

      +
    • +
    +
    +
  • +
+
+
+

2.1. Config Example:

+
+
conf.yml
+
+
model_args: # model construction args
+  ff_size: 2048
+  hid_size: 512
+  n_heads: 8
+  attn_dropout: 0.1  # Use lower dropout rates for attention because it masks an entire timestep
+  dropout: 0.2
+  enc_layers: 6
+  dec_layers: 6
+  src_vocab: 8000
+  tgt_vocab: 8000
+  tied_emb: three-way  # choices: null, one-way, two-way, three-way
+  # self_attn_rel_pos: 8  # enable relative pos self attention with window=8
+model_type: tfmnmt  # model type. tfmnmt is the transformer NMT model
+optimizer:
+  name: adam
+  args:
+    betas:
+    - 0.9
+    - 0.98
+    eps: 1.0e-09
+    lr: 0.1
+
+schedule:
+  name: noam
+  args:
+    constant: 2
+    warmup: 8000
+    model_dim: 512
+
+criterion:
+  name: smooth_kld    #options "cross_entropy", "smooth_kld", "binary_cross_entropy", "triplet_loss"
+  args:
+    label_smoothing: 0.1
+
+prep: # data preparation
+  max_types: 8000  # maximum number of types in vocab ; if shared_vocab=false, set max_src_types and max_tgt_types separately
+  pieces: bpe   # choices: bpe, char, word, unigram  from google/sentencepiece
+  shared_vocab: true  # true means same vocab for src and tgt, false means different vocabs
+  src_len: 256   # max source length; longer sentences are handled as per 'truncate={true,false}'
+  tgt_len: 256
+  truncate: true  # what to do with long sentences: if true truncate at src_len or tgt_len; if false filter away
+  train_src: wmt_data/data/de-en/europarl-v9.de-en.de.tok   # training data
+  train_tgt: wmt_data/data/de-en/europarl-v9.de-en.en.tok
+  valid_src: wmt_data/data/dev/newstest2013.de.tok
+  valid_tgt: wmt_data/data/dev/newstest2013.en.tok
+  valid_tgt_raw: wmt_data/data/dev/newstest2013.en  # unmodified; required for BLEU
+tester:
+  decoder:
+   beam_size: 4
+   batch_size: 18000   # effective size = batch_size/beam_size
+  suit:  # suite of tests to run after the training
+    newstest2013:  # name of test and list of src.tok, ref file (ref should be unmodified)
+      - wmt_data/data/dev/newstest2013.de.tok
+      - wmt_data/data/dev/newstest2013.en
+    newstest2014:  # name of test and list of src.tok, ref file (ref should be unmodified)
+      - wmt_data/data/dev/newstest2014-deen-src.de.tok
+      - wmt_data/data/dev/newstest2014-deen-ref.en
+trainer:
+  init_args:
+    chunk_size: 10   # generation in chunks of time steps to reduce memory consumption
+    grad_accum: 1     # How many batches to accumulate gradients
+  batch_size: 4200   # not exceeding these many tokens (including paddings)
+  check_point: 1000  # how often to checkpoint?
+  keep_models: 10   # how many checkpoints to keep on disk (small enough to save disk, large enough for checkpoint averaging)
+  steps: 200000      # how many steps to train; if early_stop is enabled, this is max steps
+  keep_in_mem: true   # keep training data in memory
+updated_at: '2019-03-09T21:15:33.707183'  # automatically updated by system
+seed: 12345  # fix the manual seed of pytorch + cuda + numpy + python_stdlib RNGs. Remove/comment this to disable
+
+
+
+
+

2.2. Config options

+ + ++++ + + + + + + + + + + + + + + + + + + + + + + + + +
Table 2. Summary of component choices
Component | Choices

model

tfmnmt, rnnmt, rnnlm, tfmlm, skptfmnmt, wvtfmnmt, wvskptfmnmt, tfmextembmt, robertamt, mtfmnmt, hybridmt, CBOW, tfmcls

optimizer

adam, sgd, adagrad, adam_w, adadelta, sparse_adam

schedule

noam, inverse_sqrt

criterion

sparse_cross_entropy, kl_divergence, focal_loss, binary_cross_entropy, smooth_kld, triplet_loss, smooth_kld_and_triplet_loss, dice_loss, squared_error

+
+

2.2.1. schedule options

+
+
    +
  1. +

    noam with args (see the formula sketch after this list):

    +
    +
      +
    • +

      warmup

      +
    • +
    • +

      constant

      +
    • +
    • +

      model_dim

      +
    • +
    +
    +
  2. +
  3. +

    inverse_sqrt with args:

    +
    +
      +
    • +

      warmup

      +
    • +
    • +

      peak_lr

      +
    • +
    +
    +
  4. +
+
+
+
+
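For reference, a sketch of the learning rate computed by the noam schedule, following "Attention Is All You Need" (assuming RTG's constant arg acts as an extra multiplicative scale, as its name suggests):
+
\[ lr(step) = constant \cdot model\_dim^{-0.5} \cdot \min\left(step^{-0.5},\; step \cdot warmup^{-1.5}\right) \]
+
The learning rate thus grows linearly for warmup steps and then decays proportionally to the inverse square root of the step number.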

2.2.2. criterion options

+
+
    +
  • +

    smooth_kld (recommended; used since the first version of transformer)

    +
    +
      +
    • +

      label_smoothing: float : [0, 1] : optional: default=0.1

      +
    • +
    +
    +
  • +
+
+ + +++++++ + + + + + + + + + + + + + + + + +
Table 3. Args to smooth_kld

Name

Type

Range/Choices

Required

Default

label_smoothing

float

[0.0, 1.0)

Optional

0.1

+
+
    +
  • +

    sparse_cross_entropy

    +
  • +
+
+ + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 4. Args to sparse_cross_entropy
Name | Type | Range/Choices | Required | Default | Comment

weight

str

{inv_freq, inv_sqrt_freq, inv_log_freq}

Optional

None ⇒ disable weighing

weight_calm_time

int

[0, )

Optional

0 ⇒ disable calming;

Applicable when weight is enabled

+
+
    +
  • +

    kl_divergence (re-implementation of smooth_kld with some extra features)

    +
  • +
+
+ + +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 5. Args to kl_divergence
Name | Type | Range/Choices | Required | Default

label_smoothing

float

[0.0, 1.0)

Optional

0.0 ⇒ disable label smoothing

weight

str

{inv_freq, inv_sqrt_freq, inv_log_freq}

Optional

None ⇒ disable weighing

weight_calm_time

int

[0, )

Optional

0 ⇒ disable calming ⇒ weights applicable from step 0

+
+
    +
  • +

    focal_loss

    +
  • +
+
+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + +
Args to focal_loss
Name | Type | Range/Choices | Required | Default

gamma

float

[0.0, )

Optional

0.0 ⇒ disable ⇒ cross entropy

weight_calm_time

int

[0, )

Optional

0 ⇒ disable calming ⇒ weights applicable from step 0

+
+
    +
  • +

    Experimental loss functions:

    +
    +
      +
    • +

      dice_loss

      +
    • +
    • +

      binary_cross_entropy

      +
    • +
    • +

      triplet_loss

      +
    • +
    • +

      squared_error

      +
    • +
    +
    +
  • +
+
+
+
+
+

2.3. Early stop

+
+

Add the below piece of config to the trainer block to enable early stopping on convergence.

+
+
+
+
trainer:
+  ....           # other args
+  steps: 100000      # steps is treated as max steps
+  checkpoint: 1000   # validate once every this many steps
+  early_stop:       # remove this block to disable
+    enabled: true   # or, alternatively flip this to disable;
+    by: loss        # stop by validation loss (default); TODO: add BLEU
+    patience: 5     # how many validations to wait, to be sure of stopping; each validation is per check_point steps
+    min_steps: 8000  # minimum steps to wait before test for early stop;
+    signi_round: 3   # significant digits in the 'by' value, used as round(value, signi_round).
+                     # e.g. round(1/3, 3) = 0.333; round(100/3, 0) = 33; round(100/3, -1) = 30.0
+
+
+
+
+

2.4. Optimizer

+
+

By default, we use the ADAM optimizer from +Adam: A Method for Stochastic Optimization. +It is also possible to use ADAMW from Decoupled Weight Decay Regularization, +since weight decay is different in optimizers with variable step sizes.

+
+
+

An alternative optimizer may look like:

+
+
+
+
optimizer:
+  name: adamw
+  args:
+    betas:
+    - 0.9
+    - 0.98
+    eps: 1.0e-09
+    lr: 0.0005   # this doesn't matter, see "schedule"
+    weight_decay: 1e-3
+
+schedule:
+  name: inverse_sqrt
+  args:
+    warmup: 4000
+    peak_lr: 0.0005
+
+
+
+
+

2.5. Fine Tuning

+
+

We define fine-tuning as the act of changing the training data at a certain step in the training process. +To enable this feature, we need to do the following.

+
+
+

Step 1: specify finetune_src and finetune_tgt in the prep block as follows

+
+
+
+
prep: # data preparation
+  ....
+  train_src: wmt_data/data/de-en/europarl-v9.de-en.de.tok   # training data
+  train_tgt: wmt_data/data/de-en/europarl-v9.de-en.en.tok
+  finetune_src: wmt_data/data/de-en/finetune.de-en.de.tok   # Finetuning data
+  finetune_tgt: wmt_data/data/de-en/finetune.de-en.en.tok
+  valid_src: wmt_data/data/dev/newstest2013.de.tok
+  valid_tgt: wmt_data/data/dev/newstest2013.en.tok
+
+
+
+

Step 2: inform the trainer to continue training by adding finetune_steps to the trainer block.

+
+
+
+
trainer:
+  batch_size: 12000        # training batch size
+  steps: 200000           # how many steps to train
+  finetune_steps: 300000 # fine tuning steps.
+  finetune_batch_size: 1024  # fine tuning batch_size; optional; default is the training batch_size
+
+
+
+

This makes the trainer use train_{src,tgt} for 0 - 200k steps, followed by finetune_{src,tgt} +for 200k-300k steps. Note that finetune_steps must be greater than steps.

+
+
+
+

2.6. Parent-Child Transfer

+
+

To initialize from another compatible model as parent, add parent: specification to conf.yml as shown below:

+
+
+
+
model_type: tfmnmt
+model_args:
+  # will be inherited from parent; see parent.model.args: true
+parent:
+  experiment: <path/to/experiment/dir>
+  vocab:
+    shared: shared       # for reusing the shared vocab
+    #src: src            # for separate vocabs
+    #tgt: tgt
+  shrink: true        # shrink vocabularies and embeddings to child data
+                      # specified in train_{src,tgt} and mono_{src,tgt}
+  model:
+    args: true          # update/overwrite the model_args of child with the parent
+    ensemble: 5         # how many checkpoints of parent to ensemble, to obtain initial state
+# ... rest of the config such as prep, trainer etc
+
+
+
+
+

2.7. Freezing some parts of model

+
+

Freezing the weights of parts of the network means those weights remain unmodified during the course of training. +It is a useful feature when the model weights are initialized from a well-trained parent model. +Since the optimizer is what modifies a model's parameters according to their gradients, +freezing weights amounts to excluding them from the optimizer; +alternatively, we explicitly list the parts of the model that need to be trained (i.e. updated by the optimizer).

+
+
+

Here is an example: comment out or remove the parts that you wish to freeze in the 6-layer network below.

+
+
+
+
optimizer:
+  name: adam
+  args:
+    ....# the usual args for optimizer
+  trainable:  # trainable parameters
+    include: # only include these and exclude everything else not listed here
+    - src_embed
+    - tgt_embed
+    - generator
+    - 'encoder:0,1,2,3,4,5'  # the numbers are layer indices starting from 0
+    - 'decoder:0,1,2,3,4,5'  # the numbers are layer indices starting from 0
+
+
+
+

TODO: add support for exclude logic, i.e., include everything except the parts mentioned.

+
+
+

This feature is supported only in AbstractTransformerNMT and all of its children. +If you are adding a new NMTModel or customizing this feature, please override the get_trainable_parameters(self, include, exclude) function to support it.

+
+
+
+

2.8. Sharing Data between Experiments

+
+

In the new experiment config, add same_data to reference the parent experiment whose data +should be reused for training and validation. Note that this uses the same vocabulary as the parent. +The child experiment creates a symbolic link to the parent experiment's data (instead of copying it, +to reduce disk usage).

+
+
+

Alternatively, you may use rtg-fork --data to fork an experiment with the same data, +where the forked experiment will have a symbolic link to the parent's data (see the example after the config snippet below).

+
+
+
+
prep:
+  same_data: path/to/prior/experiment_dir
+
+
+
+
+
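For illustration, a minimal sketch of such a fork (the directory names are hypothetical; see the rtg-fork section for all flags):
+
# fork runs/001-tfm into runs/002-tfm, symlinking the parent's data and vocab
+rtg-fork --data runs/001-tfm runs/002-tfm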

2.9. Vocabulary Preprocessing

+
+

Google’s sentencepiece is an awesome lib for +preprocessing text datasets. +We’ve used sentencepiece’s python API since day one of RTG, and it is the default library. +However, since the core sentencepiece is written in C++, it was hard to modify it to explore some new +ideas on BPE (without knowing C++). So, we reimplemented BPE in pure python, with advanced +data structures such as linked lists, prefix tries and a dirty max-heap to match the speed. +Our reimplementation is named NLCodec. +NLCodec can be enabled as:

+
+
+
+
prep:
+  ....
+  codec_lib: nlcodec  # default is sentpiece
+
+
+
+

2.9.1. Vocabulary Types

+
+

Both sentpiece and nlcodec support pieces = bpe, char, word.

+
+
+
+
prep:
+  ....
+  codec_lib: nlcodec  # other option: sentpiece
+  pieces: bpe         # other options: char, word
+
+
+
+

As of now, only sentpiece supports pieces=unigram. +For classification experiments, nlcodec supports pieces=class.

+
+
+
+

2.9.2. Character coverage

+
+

For bpe and char vocabulary types, a useful trick is to exclude low-frequency characters and mark them as UNKs. +This is usually expressed as a percentage of character coverage in the training corpus. +Sentencepiece’s default (when we last checked) is 99.95%, i.e. 0.9995. +Here is how to set it to, e.g., 99.99%, i.e. 0.9999, in nlcodec:

+
+
+
+
prep:
+  ....
+  codec_lib: nlcodec      # other option: sentpiece
+  pieces: bpe             # other options: char, word
+  char_coverage: 0.9999
+
+
+
+
+
+

2.10. Sub-Word Regularization

+
+

When using codec_lib: nlcodec and pieces: bpe, you have the option to add sub-word regularization to your training. +Normally, text is split into the fewest tokens necessary to represent the sequence (greedy split). +By occasionally splitting some tokens into their constituents (suboptimal split), we can represent the same sequence in many ways. +This allows us to leverage less data more effectively.

+
+
+
+
trainer:
+  ....
+  split_ratio: 0.1        # 10% chance to suboptimally split (recursive)
+  dynamic_epoch: true     # Recompute splits for each epoch
+
+
+
+
+
+
+

3. Avoiding Out-of-Memory

+
+
+

Out-of-memory is pretty common, and we have worked out ways to avoid that situation as much as possible.

+
+
+

3.1. Trainer Memory

+
+

Let’s visualize the total required memory for training a model in the order of a 4D tensor: [ ModelDim x Batch x SequenceLength x Vocabulary]

+
+
+ +
+
+

So, we are left with Batch x SequenceLength as two dims that we can manipulate.

+
+
+

For SequenceLength, set trainer.init_args.chunk_size to a smaller value to break the whole sequence into smaller chunks. +This operation does not affect gradients, but it affects training time: smaller chunk_size ⇒ less memory, but also more chunks ⇒ more time. +Also note that prep.src_len and prep.tgt_len allow you to decide the maximum length of source and target sequences. +When combined with prep.truncate=true, all longer sequences are truncated, whereas prep.truncate=false causes longer sequences to be dropped.

+
+
+

Regarding Batch, there are some things you can do.

+
+
+
    +
  1. +

    If you have GPUs with larger memory, use them. For example, V100 with 32GB is much better than 1080 Ti with 11GB.

    +
  2. +
  3. +

    If you don't have a larger GPU but have many smaller GPUs, use several of them by setting the CUDA_VISIBLE_DEVICES variable to a comma-separated list of GPU IDs. +The built-in DataParallel module divides batches across multiple GPUs ⇒ reduces the total memory needed on each GPU.

    +
  4. +
  5. +

    If you don't have multiple GPUs, use trainer.init_args.grad_accum; e.g., if you set grad_accum=2, the effective batch size is 2 * batch_size.

    +
  6. +
+
+
+

In summary, to make the best of your GPUs, adjust trainer.init_args.chunk_size, trainer.init_args.grad_accum, and trainer.batch_size. +I suggest running gpustat -i 0.5, watching the GPU RAM usage, and deciding whether you need to increase or decrease some parameters.

+
+
+

Regarding CPU RAM, we usually need as much as a single GPU's RAM. +If you have plenty of it, enable trainer.keep_in_mem=true to reduce disk IO. +The keep_in_mem parameter informs the trainer to load the training data once and hold it in CPU RAM for the duration of training.

+
+
+
+

3.2. Decoder Memory

+
+

Since beam decoder is used, let’s visualize memory as [Batch x Beams x Vocabulary x SequenceLength]

+
+
+
    +
  • +

    tester.decoder.beam_size : number of beams to be used. You may reduce it; e.g. beam_size=4 is often a good value.

    +
  • +
  • +

    tester.decoder.batch_size is specified for 1 beam; internally, the effective batch size is computed as batch_size/beam_size.

    +
  • +
  • +

    tester.decoder.max_len is a relative length. It decides how long the target sequence can grow in relation to source length. For example, max_len=50 ⇒ len(src) + 50

    +
  • +
+
+
+

rtg-decode has a --max-src-len argument that can be used to hard-limit the length of source sentences. +--max-src-len can degrade test performance since it drops words. +The right thing to do for long sequences is to split long sentences in the input and merge the outputs after decoding.

+
+
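For example, a sketch of decoding with a hard source-length limit (the experiment dir and file names are hypothetical):
+
# truncate source sentences longer than 250 tokens while decoding
+python -m rtg.decode runs/001-tfm -msl 250 -if test.de.tok -of test.out.tsv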
+
+
+
+

4. Migration

+
+
+

4.1. v0.5.0 or earlier to v0.6.0

+
+

The optimizer block got a big update in v0.6.0; as a result, it is not backward compatible.

+
+
+
Old config, prior to v0.6.0:
+
+
optim:
+  args:
+    betas:
+    - 0.9
+    - 0.98
+    eps: 1.0e-09
+    label_smoothing: 0.1
+    lr: 0.1
+    warmup_steps: 4000
+    amsgrad: false
+    weight_decay: 0
+    criterion: smooth_kld
+    inv_sqrt: false
+    constant: 2
+  name: ADAM
+
+
+
+
New config in v0.6.0
+
+
optimizer:
+  name: adam
+  args:
+    betas:
+    - 0.9
+    - 0.98
+    eps: 1.0e-09
+    lr: 0.1
+
+schedule:
+  name: noam
+  args:
+    constant: 2
+    warmup: 4000
+    model_dim: 512
+
+criterion:
+  name: smooth_kld
+  args:
+    label_smoothing: 0.1
+
+
+
+
+
+
+

5. RTG CLI

+
+
+

All the below CLI tools give you finer, step-by-step control, in case you want to test only a part of the pipeline. +For end usage of the RTG toolkit, the workflow should be as simple as:

+
+
+
    +
  1. +

    Edit the conf.yml file

    +
  2. +
  3. +

    Run the pipeline using python -m rtg.pipeline or rtg-pipe command

    +
  4. +
  5. +

    Occasionally, to decode newer test files that were not listed in conf.yml, use python -m rtg.decode or rtg-decode

    +
  6. +
+
+
+

5.1. Summary:

+
+

The following command line tools are added when rtg is installed using pip.

+
+ + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 6. Summary of CLI tools

Command

Purpose

rtg-pipe

Run rtg-prep, rtg-train and test case evaluation

rtg-decode

Decode new source files using the values set in conf.yml

rtg-export

Export an experiment

rtg-fork

Fork an experiment with/without same conf, code, data, vocabularies etc

rtg-serve

Serve an RTG model over HTTP API using Flask server

rtg-decode-pro

Decode new source files using the values that you supply from CLI args

rtg-prep

Prepare an experiment. You should be using rtg-pipe

rtg-train

Train a model. You should be using rtg-pipe

rtg-syscomb

System combination. Don't bother with it for now.

rtg-launch

Launch data distributed training

rtg-params

Show parameters in model

+
+
+

5.2. rtg-pipe: Pipeline

+
+

This is the CLI interface that you will most likely use.

+
+
+
+
$ python -m rtg.pipeline -h
+
+usage: rtg.prep [-h] [-G] exp [conf]
+
+prepare NMT experiment
+
+positional arguments:
+  exp             Working directory of experiment
+  conf            Config File. By default <work_dir>/conf.yml is used
+
+optional arguments:
+  -h, --help      show this help message and exit
+  -G, --gpu-only  Crash if no GPU is available
+
+
+
+
+

5.3. rtg-prep: Prepare an experiment

+
+
+
    $ python -m rtg.prep -h
+    usage: rtg.prep [-h] work_dir [conf_file]
+
+    prepare NMT experiment
+
+    positional arguments:
+      work_dir    Working directory
+      conf_file   Config File. By default <work_dir>/conf.yml is used
+
+    optional arguments:
+      -h, --help  show this help message and exit
+
+
+
+
+

5.4. rtg-train : Train a Model

+
+
+
    $ python -m rtg.train -h
+    usage: rtg.train [-h] [-rs SEED] [-st STEPS] [-cp CHECK_POINT]
+                     [-km KEEP_MODELS] [-bs BATCH_SIZE] [-op {ADAM,SGD}]
+                     [-oa OPTIM_ARGS] [-ft]
+                     work_dir
+
+    Train NMT model
+
+    positional arguments:
+      work_dir              Working directory
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -rs SEED, --seed SEED
+                            Seed for random number generator. Set it to zero to
+                            not touch this part. (default: 0)
+      -st STEPS, --steps STEPS
+                            Total steps (default: 128000)
+      -cp CHECK_POINT, --check-point CHECK_POINT
+                            Store model after every --check-point steps (default:
+                            1000)
+      -km KEEP_MODELS, --keep-models KEEP_MODELS
+                            Number of checkpoints to keep. (default: 10)
+      -bs BATCH_SIZE, --batch-size BATCH_SIZE
+                            Mini batch size of training and validation (default:
+                            256)
+      -op {ADAM,SGD}, --optim {ADAM,SGD}
+                            Name of optimizer (default: ADAM)
+      -oa OPTIM_ARGS, --optim-args OPTIM_ARGS
+                            Comma separated key1=val1,key2=val2 args to optimizer.
+                            Example: lr=0.01,warmup_steps=1000 The arguments
+                            depends on the choice of --optim (default: lr=0.001)
+      -ft, --fine-tune      Use fine tune corpus instead of train corpus.
+                            (default: False)
+
+
+
+
+

5.5. rtg-decode: Decoder

+
+
+
usage: rtg.decode [-h] [-if [INPUT [INPUT ...]]] [-of [OUTPUT [OUTPUT ...]]]
+                  [-sc] [-b BATCH_SIZE] [-msl MAX_SRC_LEN] [-nb]
+                  exp_dir
+
+Decode using NMT model
+
+positional arguments:
+  exp_dir               Experiment directory
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -if [INPUT [INPUT ...]], --input [INPUT [INPUT ...]]
+                        Input file path. default is STDIN (default:
+                        [<_io.TextIOWrapper name='<stdin>' encoding='utf-8'>])
+  -of [OUTPUT [OUTPUT ...]], --output [OUTPUT [OUTPUT ...]]
+                        Output File path. default is STDOUT (default:
+                        [<_io.TextIOWrapper name='<stdout>'
+                        encoding='utf-8'>])
+  -sc, --skip-check     Skip Checking whether the experiment dir is prepared
+                        and trained (default: False)
+  -b BATCH_SIZE, --batch-size BATCH_SIZE
+                        batch size for 1 beam. effective_batch =
+                        batch_size/beam_size (default: None)
+  -msl MAX_SRC_LEN, --max-src-len MAX_SRC_LEN
+                        max source len; longer seqs will be truncated
+                        (default: None)
+  -nb, --no-buffer      Processes one line per batch followed by flush output
+                        (default: False)
+
+
+
+
+

5.6. rtg-decode-pro: Pro Decoder

+
+

Note: for simple use with defaults from conf.yml, use rtg-decode or python -m rtg.decode.

+
+
+
+
    $ python -m rtg.decode_pro -h
+    usage: rtg.decode [-h] [-if INPUT] [-of OUTPUT] [-bs BEAM_SIZE] [-ml MAX_LEN]
+                      [-nh NUM_HYP] [--prepared]
+                      [-bp {E1D1,E2D2,E1D2E2D1,E2D2E1D2,E1D2,E2D1}] [-it] [-sc]
+                      [-en ENSEMBLE] [-cb SYS_COMB]
+                      work_dir [model_path [model_path ...]]
+
+    Decode using NMT model
+
+    positional arguments:
+      work_dir              Working directory
+      model_path            Path to model's checkpoint. If not specified, a best
+                            model (based on the score on validation set) from the
+                            experiment directory will be used. If multiple paths
+                            are specified, then an ensembling is performed by
+                            averaging the param weights (default: None)
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -if INPUT, --input INPUT
+                            Input file path. default is STDIN (default:
+                            <_io.TextIOWrapper name='<stdin>' mode='r'
+                            encoding='UTF-8'>)
+      -of OUTPUT, --output OUTPUT
+                            Output File path. default is STDOUT (default:
+                            <_io.TextIOWrapper name='<stdout>' mode='w'
+                            encoding='UTF-8'>)
+      -bs BEAM_SIZE, --beam-size BEAM_SIZE
+                            Beam size. beam_size=1 is greedy, In theory: higher
+                            beam is better approximation but expensive. But in
+                            practice, higher beam doesnt always increase.
+                            (default: 5)
+      -ml MAX_LEN, --max-len MAX_LEN
+                            Maximum output sequence length (default: 100)
+      -nh NUM_HYP, --num-hyp NUM_HYP
+                            Number of hypothesis to output. This should be smaller
+                            than beam_size (default: 1)
+      --prepared            Each token is a valid integer which is an index to
+                            embedding, so skip indexifying again (default: False)
+      -bp {E1D1,E2D2,E1D2E2D1,E2D2E1D2,E1D2,E2D1}, --binmt-path {E1D1,E2D2,E1D2E2D1,E2D2E1D2,E1D2,E2D1}
+                            Sub module path inside BiNMT. applicable only when
+                            model is BiNMT (default: None)
+      -it, --interactive    Open interactive shell with decoder (default: False)
+      -sc, --skip-check     Skip Checking whether the experiment dir is prepared
+                            and trained (default: False)
+      -en ENSEMBLE, --ensemble ENSEMBLE
+                            Ensemble best --ensemble models by averaging them
+                            (default: 1)
+      -cb SYS_COMB, --sys-comb SYS_COMB
+                            System combine models at the softmax layer using the
+                            weights specified in this file. When this argument is
+                            supplied, model_path argument is ignored. (default:
+                            None)
+
+
+
+
+

5.7. rtg-fork: Fork an experiment

+
+
+
usage: rtg-fork [-h] [--conf | --no-conf] [--data | --no-data]
+                [--vocab | --no-vocab] [--code | --no-code]
+                EXP_DIR TO_DIR
+
+fork an experiment.
+
+positional arguments:
+  EXP_DIR     From experiment. Should be valid experiment dir
+  TO_DIR      To experiment. This will be created.
+
+optional arguments:
+  -h, --help  show this help message and exit
+  --conf      Copy config: from/conf.yml → to/conf.yml (default: True)
+  --no-conf   Negation of --conf (default: False)
+  --data      Link data dir . This includes vocab. (default: True)
+  --no-data   Negation of --data (default: False)
+  --vocab     copy vocabularies. dont use it with --data (default: False)
+  --no-vocab  Negation of --vocab (default: True)
+  --code      copy source code. (default: True)
+  --no-code   Negation of --code (default: False)
+
+
+
+
+

5.8. rtg-export Export

+
+

Export an experiment:

+
+
+
+
    python -m rtg.export -h
+    usage: export.py [-h] [-en ENSEMBLE] [-nm NAME] [--config | --no-config]
+                     [--vocab | --no-vocab]
+                     source target
+
+    positional arguments:
+      source                Path to experiment (source)
+      target                Path to destination where the export should be
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -en ENSEMBLE, --ensemble ENSEMBLE
+                            Maximum number of checkpoints to average and export.
+                            set 0 to disable (default: 5)
+      -nm NAME, --name NAME
+                            Name for the exported model (active when --ensemble >
+                            0). Value should be a single word. This will be useful
+                            if you are going to place multiple exports in a same
+                            dir for system combination (default: None)
+      --config              Copy config (default: True)
+      --no-config           See --config (default: False)
+      --vocab               Copy vocabulary files (such as sentence piece models)
+                            (default: True)
+      --no-vocab            See --vocab (default: False)
+
+
+
+
+
+
+

6. Other tools:

+
+
+

6.1. rtg-syscomb System Combiner

+
+
+
    python -m rtg.syscomb -h
+    usage: __main__.py [-h] [-b BATCH_SIZE] [-s STEPS]
+                       experiment models [models ...]
+
+    positional arguments:
+      experiment            Path to experiment directory
+      models                Path to models
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -b BATCH_SIZE, --batch-size BATCH_SIZE
+                            Batch size (default: 128)
+      -s STEPS, --steps STEPS
+                            Training steps (default: 2000)
+
+
+
+
+

6.2. Perplexity

+
+

Compute perplexity of a language model on a test set.

+
+
+
+
    $ python -m rtg.eval.perplexity -h
+    usage: rtg.eval.perplexity [-h] [-t TEST] [-en ENSEMBLE]
+                           work_dir [model_path [model_path ...]]
+
+    positional arguments:
+    work_dir              Working/Experiment directory
+    model_path            Path to model's checkpoint. If not specified, a best
+                        model (based on the score on validation set) from the
+                        experiment directory will be used. If multiple paths
+                        are specified, then an ensembling is performed by
+                        averaging the param weights (default: None)
+
+    optional arguments:
+    -h, --help            show this help message and exit
+    -t TEST, --test TEST  test file path. default is STDIN (default:
+                        <_io.TextIOWrapper name='<stdin>' mode='r'
+                        encoding='UTF-8'>)
+    -en ENSEMBLE, --ensemble ENSEMBLE
+                        Ensemble best --ensemble models by averaging them
+                        (default: 1)
+
+
+
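An example invocation (a sketch; the experiment dir and test file names are hypothetical):
+
python -m rtg.eval.perplexity runs/003-tfmlm -t test.en -en 2
+# scores test.en with an ensemble (average) of the best 2 checkpoints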
+
+

6.3. Line Bleu

+
+

Computes BLEU per line

+
+
+
+
    python -m rtg.eval.linebleu -h
+    usage: linebleu.py [-h] [-c CANDS] [-r REFS] [-n N] [-nr] [-nc] [-o OUT] [-v]
+
+    Computes BLEU score per record.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -c CANDS, --cands CANDS
+                            Candidate (aka output from NLG system) file (default:
+                            <_io.TextIOWrapper name='<stdin>' mode='r'
+                            encoding='UTF-8'>)
+      -r REFS, --refs REFS  Reference (aka human label) file (default:
+                            <_io.TextIOWrapper name='<stdin>' mode='r'
+                            encoding='UTF-8'>)
+      -n N, --n N           maximum n as in ngram. (default: 4)
+      -nr, --no-refs        Do not write references to --out (default: False)
+      -nc, --no-cands       Do not write candidates to --out (default: False)
+      -o OUT, --out OUT     Output file path to store the result. (default:
+                            <_io.TextIOWrapper name='<stdout>' mode='w'
+                            encoding='UTF-8'>)
+      -v, --verbose         verbose mode (default: False)
+
+
+
+
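An example invocation (a sketch; file names are hypothetical):
+
python -m rtg.eval.linebleu -c hyps.out -r refs.txt -o line_bleu.tsv
+# writes per-line BLEU (up to 4-grams by default) to line_bleu.tsv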
+

6.4. OOV

+
+

Compute Out-of-Vocabulary(OOV) rate

+
+
+
+
    $ python -m rtg.tool.oov -h
+    usage: oov.py [-h] -tr TRAIN [-ts [TESTS [TESTS ...]]]
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -tr TRAIN, --train TRAIN
+                            Train file path (default: None)
+      -ts [TESTS [TESTS ...]], --test [TESTS [TESTS ...]]
+                            Test file paths (default: [<_io.TextIOWrapper
+                            name='<stdin>' mode='r' encoding='UTF-8'>])
+
+
+
+
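An example invocation (a sketch; file names are hypothetical):
+
python -m rtg.tool.oov -tr train.en.tok -ts newstest2013.en.tok newstest2014.en.tok
+# reports the OOV rate of each test file relative to the training vocabulary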
+

6.5. Class imbalance, Sequence lengths

+
+

Computes class imbalance on the training data and reports mean and median sequence lengths. +Gets the stats reported in Gowda and May's Neural Machine Translation with Imbalanced Classes

+
+
+
+
$ python -m rtg.eval.imbalance -h
+usage: imbalance.py [-h] exp
+
+positional arguments:
+  exp         Path to experiment directory
+
+optional arguments:
+  -h, --help  show this help message and exit
+
+
+
+

Example:

+
+
+
+
$ python -m rtg.eval.imbalance runs/001-tfm
+Experiment: runs/001-tfm shared_vocab:True
+src types: 500 toks: 2,062,912 len_mean: 15.8686 len_median: 15.0 imbalance: 0.4409
+tgt types: 500 toks: 1,711,685 len_mean: 13.1668 len_median: 12.0 imbalance: 0.4632
+n_segs: 130,000
+
+
+
+
+
+
+

7. Environment Variables

+
+
+

7.1. GPUs

+
+

By default, RTG uses all GPUs specified by CUDA_VISIBLE_DEVICES environment variable.

+
+
+

To check if the GPU is configured correctly, run:

+
+
+
+
python -c 'import torch; print(torch.cuda.is_available(), torch.cuda.device_count())'
+# prints True and number_of_gpus
+
+
+
+

You can specify multiple GPUs, say devices with IDs 0 and 1:

+
+
+
+
export CUDA_VISIBLE_DEVICES=0,1
+
+
+
+

To disable GPU usage, simply set the variable to an empty string or unset it:

+
+
+
+
export CUDA_VISIBLE_DEVICES=
+unset CUDA_VISIBLE_DEVICES
+
+
+
+
+

7.2. Fast Temporary FileSystem

+
+

When shared compute grids with network file systems (NFS) are used, disk IO can be too slow. +It helps to move frequently read training data to a fast temporary file system. +Placing training data on TMPFS is a good thing to do in this situation. +Export RTG_TMP to the desired path, such as $TMPDIR, before starting the rtg process.

+
+
+
+
export RTG_TMP=$TMPDIR
+
+
+
+

RTG_TMP does NOT have to be unique to each experiment, so you can use the same directory for all +the experiments.

+
+
+

Note: the model checkpoints don't use TMPDIR as of now. Since checkpoints are +taken once every 1000 steps or so, this should be okay for now. If it becomes a problem that needs to be addressed, +we shall revisit this decision.

+
+
+
+

7.3. Number of CPU Cores

+
+
+
export RTG_CPUS=10     #$SLURM_CPUS_ON_NODE
+export OMP_NUM_THREADS=$RTG_CPUS
+export MKL_NUM_THREADS=$RTG_CPUS
+
+
+
+

For scaling to large datasets, see the "Scaling to Big Datasets Using PySpark" section.

+
+
+
+
+
+

8. Distributed Data Parallel (DDP)

+
+
+ + + + + +
+ + +This is a new feature to RTG and not all edge cases are tested. +
+
+
+

rtg.distrib.launch simplifies the use of torch.distributed.launch as follows:

+
+
+
+
$ python -m rtg.distrib.launch -h
+usage: launch.py [-h] [-N NODES] [-r NODE_RANK] [-P PROCS_PER_NODE]
+                 [-G GPUS_PER_PROC] [--master-addr MASTER_ADDR]
+                 [--master-port MASTER_PORT] [-m | --no_python]
+                 training_script ...
+
+PyTorch distributed training launch helper utilty that will spawn up multiple
+distributed processes
+
+positional arguments:
+  training_script       The full path to the single GPU training
+                        program/script to be launched in parallel, followed by
+                        all the arguments for the training script
+  training_script_args
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -N NODES, --nodes NODES
+                        The number of nodes to use for distributed training
+                        (default: 1)
+  -r NODE_RANK, --node-rank NODE_RANK
+                        The rank of the node for multi-node distributed
+                        training (default: 0)
+  -P PROCS_PER_NODE, --procs-per-node PROCS_PER_NODE
+                        The number of processes to launch on each node with
+                        one gpu each, for GPU training, this is recommended to
+                        be set to the number of GPUs in your system so that
+                        each process can be bound to a single GPU. (default:
+                        1)
+  -G GPUS_PER_PROC, --gpus-per-proc GPUS_PER_PROC
+                        Number of GPUs to assign to each process. (default: 0)
+  --master-addr MASTER_ADDR
+                        Master node (rank 0)'s address, should be either the
+                        IP address or the hostname of node 0, for single node
+                        multi-proc training, the --master_addr can simply be
+                        127.0.0.1 (default: 127.0.0.1)
+  --master-port MASTER_PORT
+                        Master node (rank 0)'s free port that needs to be used
+                        for communciation during distributed training
+                        (default: 29500)
+  -m, --module          Changes each process to interpret the launch script as
+                        a python module, executing with the same behavior
+                        as'python -m'. (default: False)
+  --no_python           Do not prepend the training script with "python" -
+                        just exec it directly. Useful when the script is not a
+                        Python script. (default: False)
+
+
+
+

Examples

+
+
+
    +
  1. +

    Run two CPU processes (-P 2) on a single node (-N 1), for testing with no GPUs (-G 0)

    +
    +
    +
    python -m rtg.distrib.launch -N 1 -P 2 -G 0 -m rtg.pipeline  runs/005-tfm-nldb
    +
    +
    +
  2. +
  3. +

    Run on a single node, two processes, one GPU per process: -N 1 -P 2 -G 1

    +
  4. +
  5. +

    Run on two nodes, two processes each, one GPU per process: -N 2 -P 2 -G 1.

    +
    +
    +
    # on first node: rank 0
    +python -m rtg.distrib.launch -N 2 -r 0 -P 2 -G 1 -m rtg.pipeline runs/005-tfm-nldb -G
    +# on second node: rank 1
    +python -m rtg.distrib.launch -N 2 -r 1 -P 2 -G 1 -m rtg.pipeline  runs/005-tfm-nldb -G
    +
    +
    +
  6. +
+
+
+

WARNING:

+
+
+
    +
  1. +

    Don't ever use -G 2 or more (i.e. don't use 2 or more GPUs per process); instead, use a larger -P (i.e. more processes with 1 GPU each).

    +
  2. +
+
+
+
+
+

9. FP16, Mixed Precision Training

+
+
+

Note that rtg-pipe has a -fp16/--fp16 CLI flag that can be used to enable mixed-precision training.

+
+
+
+
$ rtg-pipe <experiment-dir> --fp16
+
+
+
+
+
+

10. Gradient Clipping

+
+
+

Gradient clipping is supported using torch.nn.utils.clip_grad_norm_.

+
+
+

trainer.init_args.clip_grad_norm is treated as maximum L2 norm at which gradients are clipped.

+
+
+
+
trainer:
+  init_args:
+    # grad_accum: 1   # other params for init_args are allowed
+    clip_grad_norm: 8
+
+
+
+
+
+

11. Scaling to Big Datasets Using PySpark

+
+
+

When dealing with big datasets, traditional tools such as multiprocessing and SQLite3 simply aren't enough. +In such scenarios, PySpark is a useful backend.

+
+
+

When enabled, PySpark is used to

+
+
+
    +
  • +

    compute term frequencies which help speed up BPE learning

    +
  • +
  • +

    encode data with BPE

    +
  • +
  • +

    store data in NLCodec MultipartDb

    +
  • +
+
+
+

To enable pyspark backend

+
+
+
    +
  1. +

    Install pyspark; e.g. pip install 'pyspark>=3.0.0'. Make sure you have a JVM/JDK that is compatible with pyspark.

    +
  2. +
  3. +

    For data preparation, only codec_lib: nlcodec supports the pyspark backend as of now. If you are using sentencepiece, switch to nlcodec.

    +
  4. +
  5. +

    Add the spark block to the top level of conf.yml. See experiments/spark-bigdataprep.html for a full example.

    +
  6. +
+
+
+
+
prep:
+   codec_lib: nlcodec    # only nlcodec supports pyspark backend
+   max_part_size: 1000000  # part size (num of recs); divides the training data into multiple parts
+   ... # other args
+spark: # add this block to enable spark backend
+  # double quote the keys containing dot
+  "spark.master": local[3]           # set it to local[*] to use all local CPUs
+  "spark.app.name": RTG NMT on Spark  # Name for the App
+  "spark.driver.memory": 6g
+  #key1: value1    # any other spark configs you want to control
+
+
+
+

You may use local[*] to use all CPUs; however, it is important to note that:

+
+
+
    +
  1. +

    If you have too many CPU cores (say 32 or more), the disk/storage may come under too much pressure and the overall performance could degrade.

    +
  2. +
  3. +

    Remember to exclude some CPUs from spark for other workloads, such as pytorch. In the above example, I used local[3] because I had only 4 CPUs in total and excluded one from spark.

    +
  4. +
+
+
+

Watch the spark logs for any warning messages. +Also, the log provides the Spark web UI address when the spark session is initialized.

+
+
+ + + + + +
+ + +The multi-node spark distributed mode is not tested. (But it might work out of the box if "spark.master" is correctly set) +
+
+
+
+
+

12. RTG Serve

+
+
+

RTG model can be served using Flask Server.

+
+
+

12.1. Flask Installation

+
+
+
$ pip install rtg[serve]
+
+
+
+

Flask has its own set of dependencies unrelated to the core functionality; hence, they are not installed when installing rtg.

+
+
+
+

12.2. Running

+
+
+
$ python -m rtg.serve -h  # rtg-serve
+usage: rtg.serve [-h] [-d] [-p PORT] [-ho HOST] [-msl MAX_SRC_LEN] exp_dir
+
+Deploy an RTG model to a RESTful server
+
+positional arguments:
+  exp_dir               Experiment directory
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -d, --debug           Run Flask server in debug mode (default: False)
+  -p PORT, --port PORT  port to run server on (default: 6060)
+  -ho HOST, --host HOST
+                        Host address to bind. (default: 0.0.0.0)
+  -b BASE, --base BASE  Base prefix path for all the URLs (default: None)
+  -msl MAX_SRC_LEN, --max-src-len MAX_SRC_LEN
+                        max source len; longer seqs will be truncated
+                        (default: 250)
+
+
+
+

To launch a service for the runs/001-tfm experiment, run python -m rtg.serve -d runs/001-tfm. +To use a base path of /v1: python -m rtg.serve -d runs/001-tfm -b /v1 (see the example below).

+
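For example, to serve the sample experiment under the /v1 base path (so the endpoint becomes /v1/translate on the default port 6060):
+
python -m rtg.serve -d runs/001-tfm -b /v1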
+
+

It prints: +* Running on 0.0.0.0:6060/ (Press CTRL+C to quit)

+
+
+

Currently, only the /translate API is supported. It accepts both GET with query params and POST with form params.

+
+
+ + + + + +
+ + +batch decoding is yet to be supported. The current decoder decodes only one sentence at a time. +
+
+
+

An example POST request:

+
+
+
+
 curl --data "source=Comment allez-vous?" --data "source=Bonne journée" http://localhost:6060/translate
+
+
+
+
+
{
+  "source": [
+    "Comment allez-vous?",
+    "Bonne journée"
+  ],
+  "translation": [
+    "How are you?",
+    "Have a nice day"
+  ]
+}
+
+
+
+

You can also make a GET request, as in localhost:6060/translate?source=text1&source=text2, +after properly URL-encoding text1 and text2 (see the example below). This should only be used for quick testing in your web browser.

+
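For example, a sketch of the earlier French inputs sent via GET with URL-encoded values:
+
curl "http://localhost:6060/translate?source=Comment%20allez-vous%3F&source=Bonne%20journ%C3%A9e"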
+
+
+

12.3. Google Analytics Integration

+
+

Google Analytics is supported on the web pages; however, it is disabled by default. +To enable it, set the GA_TAG environment variable before starting the rtg.serve process.

+
+
+
+
export GA_TAG="G-xxxxx"
+
+
+
+

Production Deployment +Please use uWSGI for production deployment. +If you don't already have uWSGI, you may install it via conda by running conda install -c conda-forge uwsgi.

+
+
+
+
uwsgi --http 127.0.0.1:5000 --module rtg.serve.app:app --pyargv "<path-to-exp-dir>"
+
+# or using a .ini file
+uwsgi --ini examples/uwsgi.ini
+
+
+
+

Where the uwsgi.ini has the following info:

+
+
+
+
[uwsgi]
+http = 0.0.0.0:6060
+module = rtg.serve.app:app
+pyargv = /full/path/<path-to-exp-dir> -b /v1
+master = true
+processes = 1
+stats = 127.0.0.1:9191
+
+
+
+

Note that <path-to-exp-dir> is expected to be a valid path to an experiment dir; it may be obtained using the rtg-export tool.

+
+
+
+
+
+

13. Pre-process and post-process

+
+
+

The input/source text given to the API must be pre-processed with the same settings as the preprocessing used during the training phase. So, we offer configurations to match that preprocessing:

+
+
+
    +
  • +

    src_pre_proc: list of transformations applied to source text before giving it to the model (e.g. tokenizer, lowercase)

    +
  • +
  • +

    tgt_pre_proc: list of transformations applied to target text before giving it to the model (e.g. tokenizer, lowercase)

    +
  • +
  • +

    tgt_post_proc: list of transformations applied to target text produced by the model (e.g. detokenizer, removal of unk)

    +
  • +
+
+
+

The following transformations are built into RTG, so you may simply use their name:

+
+
+
+
# imports added for completeness; the Moses* classes come from the sacremoses package
from functools import partial
import html
from sacremoses import (MosesTokenizer, MosesDetokenizer, MosesTruecaser,
                        MosesPunctNormalizer)

transformers = {
    'no_op': lambda x: x,
    'space_tok': lambda x: ' '.join(x.strip().split()),  # removes extra white spaces
    'space_detok': lambda toks: ' '.join(toks),
    'moses_tok': partial(MosesTokenizer().tokenize, escape=False, return_str=True,
                         aggressive_dash_splits=True,
                         protected_patterns=MosesTokenizer.WEB_PROTECTED_PATTERNS),
    'moses_detok': partial(MosesDetokenizer().detokenize, return_str=True, unescape=True),
    'moses_truecase': partial(MosesTruecaser().truecase, return_str=True),
    'lowercase': lambda x: x.lower(),
    'drop_unk': lambda x: x.replace('<unk>', ''),
    'html_unescape': html.unescape,
    'punct_norm': MosesPunctNormalizer().normalize
}
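As an illustration, a *_pre_proc entry is just a chain of such named transforms applied in order; a minimal sketch (not RTG's internal pipeline code), using only the pure-Python transforms above:

# hypothetical chain; each name looks up a transform in the dict above
pipeline = ['html_unescape', 'space_tok', 'lowercase']
text = 'Comment &amp;  allez-vous ?'
for name in pipeline:
    text = transformers[name](text)
print(text)  # -> 'comment & allez-vous ?'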

When {src_pre,tgt_pre,tgt_post}_proc are missing, sensible defaults are used (the same ones used in aclanthology.org/2021.acl-demo.37):

src_pre_proc:
  - html_unescape
  - punct_norm
  - moses_tok
tgt_post_proc:
  - moses_detok
  - drop_unk

You may also use a shell command line, including unix pipes, by prefixing your command with "#!". In addition, you may mix shell commands with the known (pythonic) transforms. Example:

prep:
  src_pre_proc:
    - "#!/path/to/normalizer.perl | /path/to/tokenizer.py --lang deu"
    - lowercase
  tgt_post_proc:
    - drop_unk
    - moses_detok
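Under the hood, a "#!" entry has to pipe each text through the given shell command. A rough sketch of how such a transform could be realized (an illustrative assumption, not RTG's actual implementation):

import subprocess

def shell_transform(spec: str):
    # spec looks like "#!cmd1 --arg | cmd2"; strip the "#!" prefix
    cmd = spec[2:]
    def transform(text: str) -> str:
        # feed the text to the command's stdin and read the transformed stdout
        proc = subprocess.run(cmd, shell=True, input=text, text=True,
                              capture_output=True, check=True)
        return proc.stdout.strip()
    return transform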
Disabling pre- and post-processing

You may permanently disable preprocessing and postprocessing using:
prep:
  src_pre_proc:
    - no_op
  tgt_post_proc:
    - no_op

NOTE: {src,tgt}_pre_proc and tgt_post_proc are only used by the REST API as of now; rtg.decode and rtg.prep do not yet use the pre- and post-processing text transformers.


14. Development Environment:


14.1. Run Tests


Test cases are written using the pytest framework, which can be installed using pip install pytest.


All tests should be run from the root directory of the project.

1. Run all tests, with minimal outputs: python -m pytest

2. Run all tests and also see STDOUT/STDERR: python -m pytest -s

3. Run a specific test, such as: python -m pytest -s -k 'test_pipeline_transformer'

IDE Setup:


14.2. Adding a new model

1. Go to rtg.module and create a new .py file.

2. Create a class, say MyModel, that extends either NMTModel or LangModel or Model, depending on its type.

   a. It should implement the required methods, including make_model(cls, …), make_trainer, and the property model_type; and also make_generator for generation models such as NMT and LangModel.

   b. make_model is a model factory, used to create an instance of the model. This would be unique to your model.

   c. make_trainer is a trainer factory, used for creating a trainer. Often, one of the existing (e.g. parent) models' trainers would be sufficient; but if your model requires a different training strategy or has a different training API, you may want to create a new one.

   d. Generators are used by the decoder, to run the model in eval mode.

   e. Refer to tfmnmt for an example.

3. Register the model_type in registry.py, as in the example below.
from rtg.registry import register, MODEL
from rtg.module import NMTModel

@register(kind=MODEL, name='newnmt')
class NewNMTModel(NMTModel):

    @classmethod
    def make_model(cls, exp, *args, **kwargs):
        # factory: construct and return an instance of this model
        pass

    @classmethod
    def make_trainer(cls, *args, **kwargs):
        # factory: return a trainer suitable for this model
        pass

    @classmethod
    def make_generator(cls, *args, **kwargs):
        # factory: return a generator, used by the decoder at eval time
        pass
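Once registered, the new name should be selectable from an experiment config. A hypothetical conf.yml fragment (assuming the usual RTG layout, where model_type picks the model from the registry):

model_type: newnmt
model_args:
  # arguments for NewNMTModel.make_model go here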
NOTE: If your model is similar to an existing model and matches most of its API, you should reuse one of those trainers and generators.

15. PyPI Release Instructions


15.1. Steps:

1. Update the version: __version__ in rtg/__init__.py

2. Remove old builds (if any):

   rm -r build dist *.egg-info

3. Build:

   python setup.py sdist bdist_wheel

4. Upload to testpypi:

   twine upload -r testpypi dist/*

5. Make docs and link:

   a. List a new version for docs: docs/versions.adoc

   b. Build docs:

      docs/make-docs.sh      # docs

6. Upload to pypi:

   twine upload -r pypi dist/*

15.2. The .pypirc file


The rc file ~/.pypirc should have something like this:

[distutils]
index-servers =
    pypi
    testpypi

[pypi]
repository: https://upload.pypi.org/legacy/
username: Thamme.Gowda
password: <password_here>

[testpypi]
repository: https://test.pypi.org/legacy/
username: Thamme.Gowda
password: <password_here>

For first-time users of PyPI: you need to create an account at pypi.org AND test.pypi.org. Yes, they are two different accounts! Make your life easy by using the same user ID and password.


16. Docker Release


16.1. Docker for ARM64


With Apple moving to ARM chips, running AMD64 docker images on ARM hosts is inefficient, since they must run under emulation.

+
cd dockers
+# find a suitable docker file to build; e.g., this one
+docker build . -f rtg-0.7-py39_tr110_cu114.dockerfile-t tgowda/rtg:0.7-py39_tr110_cu114
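If you are building on an x86-64 host instead, docker buildx can cross-build an ARM64 image; a sketch, assuming a buildx builder with linux/arm64 support is set up (the -arm64 tag suffix here is just illustrative):

docker buildx build --platform linux/arm64 -f rtg-0.7-py39_tr110_cu114.dockerfile -t tgowda/rtg:0.7-py39_tr110_cu114-arm64 .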
Acknowledgements


The research is based upon work supported by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via AFRL Contract #FA8650-17-C-9116. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the ODNI, IARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for Governmental purposes notwithstanding any copyright annotation thereon.


This material is based on research sponsored by Air Force Research Laboratory (AFRL) under agreement number FA8750-19-1-1000. The U.S. Government is authorized to reproduce and distribute reprints for Government purposes notwithstanding any copyright notation therein.

diff --git a/docs/versions.adoc b/docs/versions.adoc
index 2cf3178..67c2edd 100644
--- a/docs/versions.adoc
+++ b/docs/versions.adoc
@@ -1,6 +1,7 @@
 = RTG Docs Versions
 :hide-uri-scheme:
 
+* link:v0.7[v0.7]
 * link:v0.6.1[v0.6.1]
 * link:v0.6.0[v0.6.0]
 * link:v0.5.2[v0.5.2]
diff --git a/docs/versions.html b/docs/versions.html
index 6491841..502a8a0 100644
diff --git a/rtg/serve/static/docs.html b/rtg/serve/static/docs.html
index 6429623..88cf03b 100644