From c8558ee310e0acc1bdc74b7147f094ccbf7e5227 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Wed, 15 May 2024 18:53:15 +0200
Subject: [PATCH] selftests/bpf: Add bpf_burst scheduler & test

This patch implements the burst BPF MPTCP scheduler, named bpf_burst,
which mirrors the default in-kernel packet scheduler in
net/mptcp/protocol.c: bpf_burst_get_send() uses the same logic as
mptcp_subflow_get_send() and bpf_burst_get_retrans() uses the same
logic as mptcp_subflow_get_retrans().

A new test, test_burst(), is added for this bpf_burst scheduler. It
calls test_bpf_sched() with WITH_DATA for both subflow arguments,
meaning data is expected to have been sent on both net devices. The
test is run as the "burst" subtest of the mptcp selftest.

Signed-off-by: Geliang Tang
Reviewed-by: Mat Martineau
Reviewed-by: Matthieu Baerts (NGI0)
---
 .../testing/selftests/bpf/prog_tests/mptcp.c  |  15 ++
 .../selftests/bpf/progs/mptcp_bpf_burst.c     | 202 ++++++++++++++++++
 2 files changed, 217 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c

diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index ce3c352cc6d5fd..4472aa404da033 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -14,6 +14,7 @@
 #include "mptcp_bpf_bkup.skel.h"
 #include "mptcp_bpf_rr.skel.h"
 #include "mptcp_bpf_red.skel.h"
+#include "mptcp_bpf_burst.skel.h"
 
 #define NS_TEST "mptcp_ns"
 #define ADDR_1 "10.0.1.1"
@@ -607,6 +608,18 @@ static void test_red(void)
 	mptcp_bpf_red__destroy(skel);
 }
 
+static void test_burst(void)
+{
+	struct mptcp_bpf_burst *skel;
+
+	skel = mptcp_bpf_burst__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load: burst"))
+		return;
+
+	test_bpf_sched(skel->obj, "burst", WITH_DATA, WITH_DATA);
+	mptcp_bpf_burst__destroy(skel);
+}
+
 void test_mptcp(void)
 {
 	if (test__start_subtest("base"))
@@ -625,4 +638,6 @@ void test_mptcp(void)
 		test_rr();
 	if (test__start_subtest("red"))
 		test_red();
+	if (test__start_subtest("burst"))
+		test_burst();
 }
diff --git a/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
new file mode 100644
index 00000000000000..0026587a94df3d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_bpf_burst.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023, SUSE. */
+
+#include "mptcp_bpf.h"
+#include <bpf/bpf_tracing.h>
+#include <limits.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define MPTCP_SEND_BURST_SIZE	65428
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+struct bpf_subflow_send_info {
+	__u8 subflow_id;
+	__u64 linger_time;
+};
+
+extern bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) __ksym;
+extern void mptcp_set_timeout(struct sock *sk) __ksym;
+extern __u64 mptcp_wnd_end(const struct mptcp_sock *msk) __ksym;
+extern bool tcp_stream_memory_free(const struct sock *sk, int wake) __ksym;
+extern bool bpf_mptcp_subflow_queues_empty(struct sock *sk) __ksym;
+extern void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) __ksym;
+
+#define SSK_MODE_ACTIVE	0
+#define SSK_MODE_BACKUP	1
+#define SSK_MODE_MAX	2
+
+static __always_inline __u64 div_u64(__u64 dividend, __u32 divisor)
+{
+	return dividend / divisor;
+}
+
+static __always_inline bool tcp_write_queue_empty(struct sock *sk)
+{
+	const struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk);
+
+	return tp ? tp->write_seq == tp->snd_nxt : true;
+}
+
+static __always_inline bool tcp_rtx_and_write_queues_empty(struct sock *sk)
+{
+	return bpf_mptcp_subflow_queues_empty(sk) && tcp_write_queue_empty(sk);
+}
+
+static __always_inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
+{
+	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+		return false;
+
+	return tcp_stream_memory_free(sk, wake);
+}
+
+static __always_inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	return __sk_stream_memory_free(sk, 0);
+}
+
+SEC("struct_ops")
+void BPF_PROG(mptcp_sched_burst_init, struct mptcp_sock *msk)
+{
+}
+
+SEC("struct_ops")
+void BPF_PROG(mptcp_sched_burst_release, struct mptcp_sock *msk)
+{
+}
+
+static int bpf_burst_get_send(struct mptcp_sock *msk,
+			      struct mptcp_sched_data *data)
+{
+	struct bpf_subflow_send_info send_info[SSK_MODE_MAX];
+	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
+	__u32 pace, burst, wmem;
+	__u64 linger_time;
+	struct sock *ssk;
+	int i;
+
+	/* pick the subflow with the lower wmem/wspace ratio */
+	for (i = 0; i < SSK_MODE_MAX; ++i) {
+		send_info[i].subflow_id = MPTCP_SUBFLOWS_MAX;
+		send_info[i].linger_time = -1;
+	}
+
+	for (i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+		subflow = bpf_mptcp_subflow_ctx_by_pos(data, i);
+		if (!subflow)
+			break;
+
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		pace = subflow->avg_pacing_rate;
+		if (!pace) {
+			/* init pacing rate from socket */
+			subflow->avg_pacing_rate = ssk->sk_pacing_rate;
+			pace = subflow->avg_pacing_rate;
+			if (!pace)
+				continue;
+		}
+
+		linger_time = div_u64((__u64)ssk->sk_wmem_queued << 32, pace);
+		if (linger_time < send_info[subflow->backup].linger_time) {
+			send_info[subflow->backup].subflow_id = i;
+			send_info[subflow->backup].linger_time = linger_time;
+		}
+	}
+	mptcp_set_timeout(sk);
+
+	/* pick the best backup if no other subflow is active */
+	if (send_info[SSK_MODE_ACTIVE].subflow_id == MPTCP_SUBFLOWS_MAX)
+		send_info[SSK_MODE_ACTIVE].subflow_id = send_info[SSK_MODE_BACKUP].subflow_id;
+
+	subflow = bpf_mptcp_subflow_ctx_by_pos(data, send_info[SSK_MODE_ACTIVE].subflow_id);
+	if (!subflow)
+		return -1;
+	ssk = mptcp_subflow_tcp_sock(subflow);
+	if (!ssk || !sk_stream_memory_free(ssk))
+		return -1;
+
+	burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+	wmem = ssk->sk_wmem_queued;
+	if (!burst)
+		goto out;
+
+	subflow->avg_pacing_rate = div_u64((__u64)subflow->avg_pacing_rate * wmem +
+					   ssk->sk_pacing_rate * burst,
+					   burst + wmem);
+	msk->snd_burst = burst;
+
+out:
+	mptcp_subflow_set_scheduled(subflow, true);
+	return 0;
+}
+
+static int bpf_burst_get_retrans(struct mptcp_sock *msk,
+				 struct mptcp_sched_data *data)
+{
+	int backup = MPTCP_SUBFLOWS_MAX, pick = MPTCP_SUBFLOWS_MAX, subflow_id;
+	struct mptcp_subflow_context *subflow;
+	int min_stale_count = INT_MAX;
+	struct sock *ssk;
+
+	for (int i = 0; i < data->subflows && i < MPTCP_SUBFLOWS_MAX; i++) {
+		subflow = bpf_mptcp_subflow_ctx_by_pos(data, i);
+		if (!subflow)
+			break;
+
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		/* still data outstanding at TCP level? skip this */
+		if (!tcp_rtx_and_write_queues_empty(ssk)) {
+			mptcp_pm_subflow_chk_stale(msk, ssk);
+			min_stale_count = min(min_stale_count, subflow->stale_count);
+			continue;
+		}
+
+		if (subflow->backup) {
+			if (backup == MPTCP_SUBFLOWS_MAX)
+				backup = i;
+			continue;
+		}
+
+		if (pick == MPTCP_SUBFLOWS_MAX)
+			pick = i;
+	}
+
+	if (pick < MPTCP_SUBFLOWS_MAX) {
+		subflow_id = pick;
+		goto out;
+	}
+	subflow_id = min_stale_count > 1 ? backup : MPTCP_SUBFLOWS_MAX;
+
+out:
+	subflow = bpf_mptcp_subflow_ctx_by_pos(data, subflow_id);
+	if (!subflow)
+		return -1;
+	mptcp_subflow_set_scheduled(subflow, true);
+	return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(bpf_burst_get_subflow, struct mptcp_sock *msk,
+	     struct mptcp_sched_data *data)
+{
+	if (data->reinject)
+		return bpf_burst_get_retrans(msk, data);
+	return bpf_burst_get_send(msk, data);
+}
+
+SEC(".struct_ops")
+struct mptcp_sched_ops burst = {
+	.init		= (void *)mptcp_sched_burst_init,
+	.release	= (void *)mptcp_sched_burst_release,
+	.get_subflow	= (void *)bpf_burst_get_subflow,
+	.name		= "bpf_burst",
+};
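
Note on how such a scheduler is exercised from userspace: test_bpf_sched()
is not part of this diff, so the snippet below is only a rough sketch of how
a helper like it can register the "burst" struct_ops map from the loaded
skeleton before traffic is generated. It uses standard libbpf APIs
(bpf_object__for_each_map(), bpf_map__type(), bpf_map__attach_struct_ops());
the helper name attach_burst_sched() is made up for illustration.

#include <bpf/libbpf.h>

/* Hypothetical helper: walk the BPF object loaded from mptcp_bpf_burst.c,
 * find its ".struct_ops" map ("burst") and register it, which makes the
 * "bpf_burst" scheduler available to the kernel.
 */
static struct bpf_link *attach_burst_sched(struct bpf_object *obj)
{
	struct bpf_map *map;

	bpf_object__for_each_map(map, obj) {
		if (bpf_map__type(map) == BPF_MAP_TYPE_STRUCT_OPS)
			return bpf_map__attach_struct_ops(map);
	}
	return NULL;
}

After the struct_ops map is attached, the test still has to select the
scheduler by name, "bpf_burst", for the MPTCP sockets under test (for
example via the net.mptcp.scheduler sysctl inside the test netns) before
sending data on both subflows.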