From 9872d76387cad3d07defeb6cad5ab73cdab1f8de Mon Sep 17 00:00:00 2001
From: Huanyi Chen
Date: Sun, 7 Jan 2024 22:22:58 -0500
Subject: [PATCH] Prepare L06 flipped note

---
 lectures/flipped/L06.md | 43 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/lectures/flipped/L06.md b/lectures/flipped/L06.md
index 7115c8a3..6fe2c2a5 100644
--- a/lectures/flipped/L06.md
+++ b/lectures/flipped/L06.md
@@ -1,28 +1,41 @@
-# Lecture 06: Modern Processors
+# Lecture 6 — Modern Processors
+
+## Roadmap
+
+We will talk about some techniques used to speed up CPU execution.
+
+## Mini-lecture
+
+The move from CISC to RISC led to impressive scaling of CPU clock frequency for
+a while, but we eventually hit a wall: clock speeds stopped getting faster around
+2005, topping out at around 3 GHz. That's why we turn to other techniques.
 
 - Pipelining
 
+This one is straightforward, I guess.
+
 - Register renaming
 
 ```asm
 MOV R2, R7 + 32
 ADD R1, R2
-MOV R2, R9 + 64
+MOV R2, R9 + 64 ; we can rename this R2 to, say, RY
 ADD R3, R2
 ```
 
 - Speculation
 
 ```asm
-ld rax, rbx+16 ; assume cache miss
-add rbx, 16 ; carry on anyway, ADD doesn’t need rax value from LD
- ; register renaming => LD (write to reg)/ADD (read from reg) don’t interfere
-cmp rax, 0 ; needs rax value, queue till available
-jeq null_chk ; oops! need cmp result
- ; speculate: assume branch not taken
-st rbx-16, rcx ; speculatively store to store buf (not L1)
-ld rcx, rdx ; unrelated cache miss: 2 misses now active, 1 speculative
-ld rax, rax+8 ; now must wait for result of first LD
+ld rax, rbx+16   ; assume cache miss
+add rbx, 16      ; carry on anyway, ADD doesn’t need rax value from LD
+                 ; register renaming => LD (read rbx)/ADD (write to renamed rbx) don’t interfere
+cmp rax, 0       ; needs rax value, queue till available
+jeq null_chk     ; oops! need cmp result
+                 ; speculate: assume branch not taken
+st rbx-16, rcx   ; speculatively store to store buf (not L1)
+ld rcx, rdx      ; unrelated cache miss: 2 misses now active, 1 speculative
+ld rax, rax+8    ; now must wait for result of first LD since we need rax
+                 ; but we still almost cut the time in half
 ```
 
 ## Calculation
@@ -30,8 +43,8 @@ ld rax, rax+8 ; now must wait for result of first LD
 ### q1
 
 Assume we can always find the data in L3 cache, cache miss rates are 40 per 1000
-for L1D and 4 per 1000 for L2, and cache miss penalty are 5 cycles for L1D and 300
-cycles for L2, what is the average running time for an instruction?
+for L1D (L1 data) and 4 per 1000 for L2, and the cache miss penalties are 5 cycles
+for L1D and 300 cycles for L2, what is the average running time of an instruction?
 
 ### q2
 
@@ -44,7 +57,9 @@ if you have a page fault?
 
 Talked about frequency scaling.
 
-Pipelining: Put 5 instructions on post-it notes. First, had a student acting out executing the stages of the instructions sequentially. Then, had 4 more students come up, and acted out pipelining the instructions. Just a bit of chaos here.
+Pipelining: Put 5 instructions on post-it notes. First, had one student act out
+executing the stages of the instructions sequentially. Then, had 4 more students
+come up and acted out pipelining the instructions. Just a bit of chaos here.
 
 Did an illustration of waiting for cache/working in the miss shadow.
 
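
For the pipelining activity, a quick count of what the post-it-note exercise acts
out, assuming a classic 5-stage pipeline with no stalls or hazards (an idealization;
the note does not specify the stages):

```python
# Cycles to run n instructions through a k-stage pipeline, ignoring hazards.
def sequential_cycles(n, k):
    # each instruction goes through all k stages before the next one starts
    return n * k

def pipelined_cycles(n, k):
    # fill the pipeline once (k cycles), then retire one instruction per cycle
    return k + (n - 1)

print(sequential_cycles(5, 5))  # 25 steps, acted out one instruction at a time
print(pipelined_cycles(5, 5))   # 9 steps once the five students overlap stages
```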
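
For q1, a minimal back-of-the-envelope sketch, assuming a base cost of 1 cycle per
instruction and that the quoted miss rates apply per instruction (both assumptions
are mine; the question does not state them):

```python
# Average cycles per instruction under the q1 numbers, assuming (not stated in
# the question) a 1-cycle base cost and miss rates counted per instruction.
base_cost = 1              # cycles when everything hits in L1D (assumed)
l1d_miss_rate = 40 / 1000  # L1D misses per instruction
l2_miss_rate = 4 / 1000    # L2 misses per instruction
l1d_penalty = 5            # extra cycles for an L1D miss served by L2
l2_penalty = 300           # extra cycles for an L2 miss served by L3

average = base_cost + l1d_miss_rate * l1d_penalty + l2_miss_rate * l2_penalty
print(average)  # 1 + 0.2 + 1.2 = 2.4 cycles per instruction
```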