diff --git a/transformer_review/slides.md b/transformer_review/slides.md
new file mode 100644
index 0000000..d415a8f
--- /dev/null
+++ b/transformer_review/slides.md
@@ -0,0 +1,301 @@
+% Transformers Review
+
+# Transformers Overview
++ "Attention is all you need" (2017)
++ The go-to architecture for most machine learning problems, especially language models
+
+# Brief recap of neural nets
+
+!["Neural net visualization"](./neural_net.svg)
+
+# Brief recap of neural nets (more)
+
++ Individually simple neurons connected via layers
++ Weights and biases are changed in training
+  * The number of neurons and the layer structure do not change during training
++ Theoretically universal
+ * In practice often learns spurious relationships without more safeguards
+ * Architectures provide these safeguards and are therefore **subtractive** not
+ **additive**
++ Calculating with weights and biases can be rewritten as matrix multiplication
+  and addition (sketched in code on the next slide)
+ * Every layer-to-layer connection of weights can be interpreted as a matrix of size n x m
+ - n is the size of the previous layer and m is the size of the next layer
+ - Entries in matrices are connection weights between two neurons
+ - Passing the outputs of one layer as the inputs to the next is
+ multiplication of those inputs by the matrix
+  * The set of biases of a layer of neurons can be interpreted as a single-column matrix (i.e. a vector)
+ - Each neuron in the layer has one bias entry in the matrix
+
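+# Brief recap of neural nets (in code)
+
+A minimal sketch of one layer-to-layer step as matrix math, per the previous
+slide. Sizes are toy values; `W` and `b` would be learned in training:
+
+```python
+import numpy as np
+
+n, m = 4, 3                    # size of previous layer, size of next layer
+W = np.random.randn(n, m)      # one weight per connection between two neurons
+b = np.random.randn(m)         # one bias per neuron in the next layer
+
+x = np.random.randn(n)         # outputs of the previous layer
+h = np.maximum(0, x @ W + b)   # multiply by the matrix, add biases, apply ReLU
+```
+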
+# Transformers Architecture
+::: columns
+
+:::: column
+![](./transformer_arch.webp){ height=800px }
+::::
+
+:::: column
+> **1. Encoder + decoder**
+
+> 2. Attention
+> 3. Multi-Head Attention
+> 4. Positional Encoding
+> 5. Transformer Blocks
+
+::::
+:::
+
+# 1. Encoder + decoder
++ Have one neural net (or set of nets) that outputs some abstract representation
+ of text
++ Have another neural net (or set of nets) decode that abstract representation
+ back to natural language
++ Not new with transformers (e.g. seq2seq 2014)
+
+---
+
+::: columns
+
+:::: column
+![](./transformer_arch.webp){ height=800px }
+::::
+
+:::: column
+
+> 1. Encoder + decoder
+
+> **2. Attention**
+
+> 3. Multi-Head Attention
+> 4. Positional Encoding
+> 5. Transformer Blocks
+
+::::
+:::
+
+# 2. Attention
+
++ Inspired by the idea of human attention
++ Allows the model to "attend to" different parts of the input sequence at a given time
++ NLP Professor Raymond Mooney: *"You can’t cram the meaning of a whole %&!$# sentence into a single $&!#* vector!"*
+ * [...you can use your language model of informal English to fill in the masked portions](https://www.cs.utexas.edu/~mooney/cramming.html)
++ Instead, consider attention as a series of queries, keys, and values ($W_q$, $W_k$, $W_v$)
++ Two ways to explore:
+ a. [Visual, via Mohit Iyyer, University of Massachusetts Amherst 2021](https://people.cs.umass.edu/~miyyer/cs685_f21/slides/04-attention.pdf)
+ b. Analogy from Changlin
+
+# 2a. Attention visualized (Iyyer 2021)
+![](iyyer_slide18.png){ height=800px }
+
+# 2a. Attention visualized (Iyyer 2021)
+![](iyyer_slide19.png){ height=800px }
+
+# 2a. Attention visualized (Iyyer 2021)
+![](iyyer_slide20.png){ height=800px }
+
+# 2a. Attention visualized (Iyyer 2021)
+![](iyyer_slide21.png){ height=800px }
+
+# 2a. Attention visualized (Iyyer 2021)
+![](iyyer_slide22.png){ height=800px }
+
+# 2a. Attention visualized (Iyyer 2021)
+![](iyyer_slide23.png){ height=800px }
+
+# 2b. Attention as a DB query
++ I have a database with keys and values. Keys are chosen to match nicely
+  against queries; values are the data actually returned.
++ `[("Alice", "some data about Alice"), ("Bob", "some data about Bob")]`
++ Query "Get me data about keys/names that start with 'A'"
++ Match query against key
+
+# 2b. Attention as a DB query
+::: columns
+
+:::: column
++ **Abstract steps of DB query**
++ Split data into keys and values
++ Generate a query
++ Compare queries with keys
++ Use comparison to select which values to return
+::::
+
+:::: column
++ **What about a "fuzzy" DB query?**
++ Split data into keys and values
++ Generate a query
++ Instead of a binary yes/no comparison, compute a fuzzy match score between 0 and 1
++ Multiply each value by the fuzzy match and combine them all together to return
+ a "fuzzy" match
++ This reduces to a normal DB query if we constrain the fuzziness to either
+  0 or 1 (see the sketch on the next slide)
+::::
+:::
+
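+# 2b. A fuzzy DB query in code
+
+A tiny sketch of the fuzzy lookup from the previous slide, assuming numeric
+values so the weighted blend is meaningful (`fuzzy_lookup`, `exact`, and
+`fuzzy` are hypothetical names for illustration):
+
+```python
+def fuzzy_lookup(query, db, match):
+    # Score every key against the query (0 = no match, 1 = exact match),
+    # then return a score-weighted combination of the values.
+    scores = [match(query, key) for key, _ in db]
+    return sum(s * v for s, (_, v) in zip(scores, db))
+
+db = [("Alice", 1.0), ("Bob", 2.0)]          # numeric stand-ins for the data
+exact = lambda q, k: 1.0 if k.startswith(q) else 0.0
+fuzzy = lambda q, k: 1.0 if k.startswith(q) else 0.1
+
+fuzzy_lookup("A", db, exact)  # 1.0 -- fuzziness constrained to {0, 1}: a normal DB query
+fuzzy_lookup("A", db, fuzzy)  # 1.2 -- every value contributes a little
+```
+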
+# 2b. Attention: Equivalents in attention
++ Generate a key and value vector from a given word
++ Generate a query vector from the word
++ Dot product the query vector against every key vector to generate weights
++ Multiply each value vector by the weights
+
+# 2b. Attention: Generating the key, value, and query vectors
+
++ We have one matrix each for keys, values, and queries
+  * $W_k$, $W_v$, and $W_q$
++ The entries of these matrices are learned during training
+
+# 2b. Attention: Working through a specific example
++ "The car was driving too quickly through the field. *It* crashed into a tree."
++ Look at a single given word "it", which has some vector form after embedding
++ Multiply *every word*'s embedding by $W_k$ to generate key vectors for all of
+ them
++ Multiply *every word*'s embedding by $W_v$ to generate value vectors for all
+ of them
++ Multiply "it" embedding by $W_q$ to generate a single query vector
++ Dot product the query vector against every key vector to get a weight for
+  every value
++ Multiply every value by its weight and add them all together to get the final
+  attention result (sketched in code on the next slide)
++ ![Attention weight visualization](attention_example.png)
+
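+# 2b. Attention: The example in code
+
+A minimal numpy sketch of the steps above for one query token ("it", at index 8
+of a toy 10-token input). The projection matrices are random stand-ins for
+learned weights; the $\sqrt{d_k}$ scaling and the softmax are details from the
+original paper:
+
+```python
+import numpy as np
+
+def softmax(z):
+    e = np.exp(z - z.max())
+    return e / e.sum()
+
+d_model, d_k = 8, 8
+rng = np.random.default_rng(0)
+W_q, W_k, W_v = (rng.normal(size=(d_model, d_k)) for _ in range(3))
+
+E = rng.normal(size=(10, d_model))   # one embedding per word; "it" is index 8
+K = E @ W_k                          # key vector for every word
+V = E @ W_v                          # value vector for every word
+q = E[8] @ W_q                       # single query vector for "it"
+
+weights = softmax(q @ K.T / np.sqrt(d_k))  # one weight per word
+attended = weights @ V                     # weighted sum of all the values
+```
+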
+---
+
+::: columns
+
+:::: column
+![](./transformer_arch.webp){ height=800px }
+::::
+
+:::: column
+
+> 1. Encoder + decoder
+> 2. Attention
+
+> **3. Multi-Head Attention**
+
+> 4. Positional Encoding
+> 5. Transformer Blocks
+
+::::
+:::
+
+# 3. Multi-Head Attention
++ Empirical tuning (like so much of ML!)
++ The entirety of the reasoning in [the original paper](https://arxiv.org/pdf/1706.03762.pdf): "we found it beneficial"
+
+# 3. Attention is relatively unexpressive (Vaswani 2024)
++ ![](attention.png)
++ ![](cnn.png)
+
+# 3. Multi-Head Attention increases expressivity (Vaswani 2024)
+![](multihead_attention.png)
+
+
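+# 3. Multi-Head Attention in code
+
+A sketch of splitting the model dimension across two heads, running scaled
+dot-product attention per head, then concatenating and projecting back. Random
+matrices stand in for learned weights; `W_o` is the output projection:
+
+```python
+import numpy as np
+
+def attention(Q, K, V):
+    # Scaled dot-product attention over a whole sequence
+    scores = Q @ K.T / np.sqrt(K.shape[-1])
+    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
+    weights /= weights.sum(axis=-1, keepdims=True)   # row-wise softmax
+    return weights @ V
+
+n, d_model, n_heads = 10, 8, 2
+d_head = d_model // n_heads
+rng = np.random.default_rng(0)
+X = rng.normal(size=(n, d_model))                    # n token vectors
+
+heads = []
+for _ in range(n_heads):                             # one W_q, W_k, W_v per head
+    W_q, W_k, W_v = (rng.normal(size=(d_model, d_head)) for _ in range(3))
+    heads.append(attention(X @ W_q, X @ W_k, X @ W_v))
+W_o = rng.normal(size=(d_model, d_model))
+out = np.concatenate(heads, axis=-1) @ W_o           # back to n x d_model
+```
+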
+#
+::: columns
+
+:::: column
+![](./transformer_arch.webp){ height=800px }
+::::
+
+:::: column
+
+> 1. Encoder + decoder
+> 2. Attention
+> 3. Multi-Head Attention
+
+> **4. Positional Encoding**
+
+> 5. Transformer Blocks
+
+::::
+:::
+
+# 4. Positional Encoding
+
++ Attention is position invariant, as is almost everything in a transformer block
++ It is therefore common to explicitly encode position information
++ This is called a *positional encoding*: after embedding a token as a vector
+  of floats, another operation modifies the vector based on the token's index
+  in the input (one common scheme is sketched on the next slide)
+
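+# 4. Positional Encoding in code
+
+One concrete choice is the sinusoidal encoding from the original paper; a
+minimal sketch, assuming an even `d_model`:
+
+```python
+import numpy as np
+
+def positional_encoding(n, d_model):
+    pos = np.arange(n)[:, None]               # each token's index in the input
+    i = np.arange(0, d_model, 2)[None, :]     # index of each (sin, cos) pair
+    angles = pos / (10000 ** (i / d_model))
+    pe = np.zeros((n, d_model))
+    pe[:, 0::2] = np.sin(angles)              # even dimensions
+    pe[:, 1::2] = np.cos(angles)              # odd dimensions
+    return pe
+
+embeddings = np.random.randn(10, 8)                 # n x d_model token vectors
+encoded = embeddings + positional_encoding(10, 8)   # modified by token index
+```
+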
+#
+::: columns
+
+:::: column
+![](./transformer_arch.webp){ height=800px }
+::::
+
+:::: column
+
+> 1. Encoder + decoder
+> 2. Attention
+> 3. Multi-Head Attention
+> 4. Positional Encoding
+
+> **5. Transformer Blocks**
+
+::::
+:::
+
+# 5. Transformer blocks
++ A transformer model consists of all of the components we've discussed, but some of them are repeated in structures called "blocks"
+
++ Remember: the MLP is just a vanilla feed-forward neural net (one block is
+  sketched in code after the diagram).
+```
+----------------------------
+| Output |
+| ^ |
+| | |
+| Normalization <-----| |
+| ^ | |
+| | | |
+| MLP | |
+| ^ | |
+| | -------------| |
+| | |
+| Normalization <-----| |
+| ^ | |
+| | | |
+| Multi-Head Attention | |
+| ^ | |
+| | | |
+| Input -----------| |
+| |
+----------------------------
+```
+
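+# 5. A transformer block in code
+
+A minimal sketch of the block diagrammed above, with `attn` and `mlp` passed in
+as the sub-networks. Real blocks also learn a scale and shift inside the
+normalization, omitted here:
+
+```python
+import numpy as np
+
+def layer_norm(x, eps=1e-5):
+    # Normalize each vector to zero mean and unit variance
+    return (x - x.mean(-1, keepdims=True)) / (x.std(-1, keepdims=True) + eps)
+
+def transformer_block(x, attn, mlp):
+    # The "<-----" arrows in the diagram are the residual (x + ...) connections
+    x = layer_norm(x + attn(x))
+    x = layer_norm(x + mlp(x))
+    return x
+```
+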
+
+# 5. Stacking attention on top of attention
++ Keep stacking attention matrices on top of rounds of merging multiple
+ attention streams
++ The query, key, value intuition starts to fall apart
+  * What is attention "*really*"?
+  * At the end of the day, a particular set of guardrails on neural nets that
+    seems to make models good at language
+  * Again, there is no reason in theory why a sufficiently large single neural
+    net couldn't subsume the idea of attention
+ - It just doesn't happen in practice
+ - Too many spurious relationships
+ - The guardrails provided by attention cut down on spurious
+ relationships (i.e. subtractive, not additive new capabilities)
+
+# Putting It All Together
+1. Start with an input text sequence consisting of `n` tokens
+2. Convert that to `n` vectors of size `d_model` using some pretrained
+ embedding (will use `n` x `d_model` as short-hand for this)
+3. Add positional encoding: output is new set of `n` x `d_model` vectors
+4. Pass into (multi-head) attention mechanism: output is new set of `n` x `d_model` vectors
+5. Normalize the sum of input into attention and its output from the previous
+ step: output is new set of `n` x `d_model` vectors
+6. Pass vectors into MLP: output is new set of `n` x `d_model` vectors
+7. Normalize the sum of input into MLP and its output from the previous step:
+ output is new set of `n` x `d_model` vectors
+8. Repeat steps 4-7 for as many transformer blocks as the model has: output is
+ new set of `n` x `d_model` vectors
+9. Pass into final linear layer: output is new set of `n` x `d_vocabulary`
+ vectors (`d_vocabulary` is the number of possible distinct tokens)
+10. Choose the last vector: output is a `1` x `d_vocabulary` vector
+11. Choose the index with the highest scalar value (greedy decoding): output is
+    a single scalar
+12. Look up that index in the vocabulary dictionary to map it back to a text
+    token: output is a single new token (the full loop is sketched in code on
+    the next slide)
+
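+# Putting It All Together in code
+
+A schematic sketch of the whole loop, reusing `attention`,
+`positional_encoding`, and `transformer_block` from the earlier sketches. The
+vocabulary, embedding table, and all weights are random toy stand-ins:
+
+```python
+import numpy as np
+rng = np.random.default_rng(0)
+
+d_model, d_vocab = 8, 50
+vocabulary = [f"tok{i}" for i in range(d_vocab)]       # toy vocab (hypothetical)
+embed_table = rng.normal(size=(d_vocab, d_model))      # step 2's embedding
+W_out = rng.normal(size=(d_model, d_vocab))            # step 9's linear layer
+W_q, W_k, W_v = (rng.normal(size=(d_model, d_model)) for _ in range(3))
+W1, W2 = rng.normal(size=(d_model, 16)), rng.normal(size=(16, d_model))
+
+attn = lambda x: attention(x @ W_q, x @ W_k, x @ W_v)  # step 4
+mlp = lambda x: np.maximum(0, x @ W1) @ W2             # step 6: vanilla net
+
+def next_token(token_ids):
+    x = embed_table[token_ids]                         # step 2: n x d_model
+    x = x + positional_encoding(*x.shape)              # step 3
+    for _ in range(2):                                 # step 8: repeat 4-7
+        x = transformer_block(x, attn, mlp)            # steps 4-7
+    logits = x @ W_out                                 # step 9: n x d_vocabulary
+    return vocabulary[int(np.argmax(logits[-1]))]      # steps 10-12
+```
+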
diff --git a/transformer_review/transformer_arch.webp b/transformer_review/transformer_arch.webp
new file mode 100644
index 0000000..e06e3a6
Binary files /dev/null and b/transformer_review/transformer_arch.webp differ