From 98199b21e39dc2d9862a185a4d15bca8f7b0f43a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Jul 2014 15:08:59 -0400 Subject: [PATCH 01/20] add msg_tstamp.c --- tests/msg_tstamp.c | 483 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 483 insertions(+) create mode 100644 tests/msg_tstamp.c diff --git a/tests/msg_tstamp.c b/tests/msg_tstamp.c new file mode 100644 index 0000000..a870dce --- /dev/null +++ b/tests/msg_tstamp.c @@ -0,0 +1,483 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Conformance tests for MSG_TSTAMP, including + * + * - UDP MSG_TSTAMP, MSG_TSTAMP_ENQ + * - TCP MSG_TSTAMP, MSG_TSTAMP_ENQ and MSG_TSTAMP_ACK + * - IPv4 and IPv6 + * - various packet sizes (to test GSO and TSO) + * + * Consult the command line arguments for help on running + * the various testcases. + * + * This test requires a dummy TCP server. + * A simple `nc6 [-u] -l -p $DESTPORT` will do + * + * Tested against Linux 3.16-rc1 (7171511eaec5) + * + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* should be defined in include/uapi/linux/socket.h */ +#define MSG_TSTAMP 0x100000 +#define MSG_TSTAMP_ACK 0x200000 +#define MSG_TSTAMP_ENQ 0x400000 + +#ifndef SCM_TSTAMP_SND +struct scm_timestamping { + struct timespec ts[3]; +}; + +#define SCM_TSTAMP_SND 0x1 /* driver passed skb to NIC */ +#define SCM_TSTAMP_ACK 0x2 /* transport layer saw ACK */ +#define SCM_TSTAMP_ENQ 0x4 /* stack passed skb to TC layer */ +#endif + +#ifndef SOF_TIMESTAMPING_OPT_TSONLY +#define SOF_TIMESTAMPING_OPT_TSONLY (1<<7) +#endif + +#define NUM_RUNS 4 + +/* command line parameters */ +static int cfg_proto = SOCK_STREAM; +static int cfg_ipproto = IPPROTO_TCP; +static int do_ipv4 = 1; +static int do_ipv6 = 1; +static int payload_len = 10; +static int tstamp_no_payload; +static uint16_t dest_port = 9000; + +struct sockaddr_in daddr; +struct sockaddr_in6 daddr6; + +/* random globals */ +static struct timeval tv; +static struct timespec ts_prev; +static int tstamp_payload_len; + +static void __print_timestamp(const char *name, struct timespec *cur, + uint32_t key) +{ + if (!(cur->tv_sec | cur->tv_nsec)) + return; + + fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)", + name, cur->tv_sec, cur->tv_nsec / 1000, + key, tstamp_payload_len); + + if ((ts_prev.tv_sec | ts_prev.tv_nsec)) { + int64_t cur_ms, prev_ms; + + cur_ms = (long) cur->tv_sec * 1000 * 1000; + cur_ms += cur->tv_nsec / 1000; + + prev_ms = (long) ts_prev.tv_sec * 1000 * 1000; + prev_ms += ts_prev.tv_nsec / 1000; + + fprintf(stderr, " (%+ld us)", cur_ms - prev_ms); + } + + ts_prev = *cur; + fprintf(stderr, "\n"); +} + +static void print_timestamp_usr(void) +{ + struct timespec ts; + + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; + __print_timestamp(" USR", &ts, 0); + +} + +static void print_timestamp(struct scm_timestamping *tss, int tstype, int tskey) +{ + const char *tsname; + + switch (tstype) { + case SCM_TSTAMP_ENQ: + tsname = " ENQ"; + break; + case SCM_TSTAMP_SND: + tsname = " SND"; + break; + case SCM_TSTAMP_ACK: + tsname = " ACK"; + break; + default: + error(1, 0, "unknown timestamp type: %u", + tstype); + } + __print_timestamp(tsname, &tss->ts[0], tskey); +} + +static void __recv_errmsg_cmsg(struct msghdr *msg) +{ + struct sock_extended_err *serr = NULL; + struct scm_timestamping *tss = NULL; + struct cmsghdr *cm; + + for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_TIMESTAMPING) { + tss = (void *) CMSG_DATA(cm); + } else if ((cm->cmsg_level == SOL_IP && + cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_RECVERR)) { + + serr = (void *) CMSG_DATA(cm); + if (serr->ee_errno != ENOMSG || + serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) { + fprintf(stderr, "unknown ip error %d %d\n", + serr->ee_errno, + serr->ee_origin); + serr = NULL; + } + } else + fprintf(stderr, "%d, %d\n", + cm->cmsg_level, cm->cmsg_type); + } + + if (serr && tss) + print_timestamp(tss, serr->ee_info & 0x3FF, serr->ee_data); +} + +static int recv_errmsg(int fd) +{ + static char ctrl[1024 /* overcommit */]; + static struct msghdr msg; + struct iovec entry; + static char *data; + int ret = 0; + + data = malloc(payload_len); + if (!data) + error(1, 0, "malloc"); + + memset(&msg, 0, sizeof(msg)); + memset(&entry, 0, sizeof(entry)); + memset(ctrl, 0, sizeof(ctrl)); + memset(data, 0, sizeof(data)); + + entry.iov_base = data; + /* for TCP we specify payload length to read one packet at a time. */ + entry.iov_len = payload_len; + msg.msg_iov = &entry; + msg.msg_iovlen = 1; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT); + if (ret == -1 && (errno == EINTR || errno == EWOULDBLOCK)) + goto done; + if (ret == -1) + error(1, errno, "recvmsg"); + + tstamp_payload_len = ret; + if (tstamp_no_payload && tstamp_payload_len) + error(1, 0, "recv: payload when configured without"); + else if (!tstamp_no_payload && !tstamp_payload_len) + error(1, 0, "recv: no payload when configured with"); + + __recv_errmsg_cmsg(&msg); + +done: + free(data); + return ret == -1; +} + +static void do_test(int family, unsigned int flags) +{ + char *buf; + int fd, i, val, total_len; + + if (family == IPPROTO_IPV6 && cfg_proto != SOCK_STREAM) { + /* due to lack of checksum generation code */ + fprintf(stderr, "test: skipping datagram over IPv6\n"); + return; + } + + total_len = payload_len; + if (cfg_proto == SOCK_RAW) { + total_len += sizeof(struct udphdr); + if (cfg_ipproto == IPPROTO_RAW) + total_len += sizeof(struct iphdr); + } + + buf = malloc(total_len); + if (!buf) + error(1, 0, "malloc"); + + fd = socket(family, cfg_proto, cfg_ipproto); + if (fd < 0) + error(1, errno, "socket"); + + if (cfg_proto == SOCK_STREAM) { + val = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (char*) &val, sizeof(val))) + error(1, 0, "setsockopt no nagle"); + + if (family == PF_INET) { + if (connect(fd, (void *) &daddr, sizeof(daddr))) + error(1, errno, "connect ipv4"); + } else { + if (connect(fd, (void *) &daddr6, sizeof(daddr6))) + error(1, errno, "connect ipv6"); + } + } + + if (tstamp_no_payload) { + val = SOF_TIMESTAMPING_OPT_TSONLY; + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, + (char *) &val, sizeof(val))) + error(1, 0, "setsockopt no payload"); + } + + for (i = 0; i < NUM_RUNS; i++) { + memset(&ts_prev, 0, sizeof(ts_prev)); + memset(buf, 'a' + i, total_len); + buf[total_len - 2] = '\n'; + buf[total_len - 1] = '\0'; + + if (cfg_proto == SOCK_RAW) { + struct udphdr *udph; + struct iphdr *iph; + int off = 0; + + if (cfg_ipproto == IPPROTO_RAW) { + iph = (void *) buf; + + memset(iph, 0, sizeof(*iph)); + iph->ihl = 5; + iph->version = 4; + iph->ttl = 2; + iph->daddr = daddr.sin_addr.s_addr; + iph->protocol = IPPROTO_UDP; + /* kernel writes saddr, csum, len */ + + off = sizeof(*iph); + } + + udph = (void *) buf + off; + udph->source = ntohs(9000); /* random spoof */ + udph->dest = ntohs(dest_port); + udph->len = ntohs(sizeof(*udph) + payload_len); + udph->check = 0; /* not allowed for IPv6 */ + } + + gettimeofday(&tv, NULL); + if (cfg_proto != SOCK_STREAM) { + if (family == PF_INET) + val = sendto(fd, buf, total_len, flags, (void *) &daddr, sizeof(daddr)); + else + val = sendto(fd, buf, total_len, flags, (void *) &daddr6, sizeof(daddr6)); + } else { + val = send(fd, buf, payload_len, flags); + } + if (val != total_len) + error(1, errno, "send"); + + usleep(50 * 1000); + + print_timestamp_usr(); + while (!recv_errmsg(fd)) {} + } + + if (close(fd)) + error(1, errno, "close"); + + free(buf); + usleep(400 * 1000); +} + +static void __attribute__((noreturn)) usage(const char *filepath) +{ + fprintf(stderr, "\nUsage: %s [options] hostname\n" + "\nwhere options are:\n" + " -4: only IPv4\n" + " -6: only IPv6\n" + " -h: show this message\n" + " -l N: send N bytes at a time\n" + " -n: no payload on tstamp\n" + " -r: use raw\n" + " -R: use raw (IP_HDRINCL)\n" + " -p N: connect to port N\n" + " -u: use udp\n", + filepath); + exit(1); +} + +static void parse_opt(int argc, char **argv) +{ + int proto_count = 0; + char c; + + while ((c = getopt(argc, argv, "46hl:np:rRu")) != -1) { + switch (c) { + case '4': + do_ipv6 = 0; + break; + case '6': + do_ipv4 = 0; + break; + case 'r': + proto_count++; + cfg_proto = SOCK_RAW; + cfg_ipproto = IPPROTO_UDP; + break; + case 'R': + proto_count++; + cfg_proto = SOCK_RAW; + cfg_ipproto = IPPROTO_RAW; + break; + case 'u': + proto_count++; + cfg_proto = SOCK_DGRAM; + cfg_ipproto = IPPROTO_UDP; + break; + case 'l': + payload_len = strtoul(optarg, NULL, 10); + break; + case 'n': + tstamp_no_payload = 1; + break; + case 'p': + dest_port = strtoul(optarg, NULL, 10); + break; + case 'h': + default: + usage(argv[0]); + } + } + + if (cfg_proto != SOCK_STREAM && payload_len > 1472) + error(1, 0, "udp packet might exceed expected MTU"); + if (!do_ipv4 && !do_ipv6) + error(1, 0, "pass -4 or -6, not both"); + if (proto_count > 1) + error(1, 0, "pass -r, -R or -u, not multiple"); + + if (optind != argc - 1) + error(1, 0, "missing required hostname argument"); +} + +static void resolve_hostname(const char *hostname) +{ + struct addrinfo *addrs, *cur; + int have_ipv4 = 0, have_ipv6 = 0; + + if (getaddrinfo(hostname, NULL, NULL, &addrs)) + error(1, errno, "getaddrinfo"); + + cur = addrs; + while (cur && !have_ipv4 && !have_ipv6) { + if (!have_ipv4 && cur->ai_family == AF_INET) { + memcpy(&daddr, cur->ai_addr, sizeof(daddr)); + daddr.sin_port = htons(dest_port); + have_ipv4 = 1; + } + else if (!have_ipv6 && cur->ai_family == AF_INET6) { + memcpy(&daddr6, cur->ai_addr, sizeof(daddr6)); + daddr6.sin6_port = htons(dest_port); + have_ipv6 = 1; + } + cur = cur->ai_next; + } + if (addrs) + freeaddrinfo(addrs); + + do_ipv4 &= have_ipv4; + do_ipv6 &= have_ipv6; +} + +static void do_main(int family) +{ + fprintf(stderr, "family: %s\n", + family == PF_INET ? "INET" : "INET6"); + + fprintf(stderr, "test SND\n"); + do_test(family, MSG_TSTAMP); + + fprintf(stderr, "test ENQ\n"); + do_test(family, MSG_TSTAMP_ENQ); + + fprintf(stderr, "test ENQ + SND\n"); + do_test(family, MSG_TSTAMP_ENQ | MSG_TSTAMP); + + if (cfg_proto == SOCK_STREAM) { + fprintf(stderr, "\ntest ACK\n"); + do_test(family, MSG_TSTAMP_ACK); + + fprintf(stderr, "\ntest SND + ACK\n"); + do_test(family, MSG_TSTAMP | MSG_TSTAMP_ACK); + + fprintf(stderr, "\ntest ENQ + SND + ACK\n"); + do_test(family, MSG_TSTAMP_ENQ | MSG_TSTAMP | MSG_TSTAMP_ACK); + } +} + +const char *sock_names[] = { NULL, "TCP", "UDP", "RAW" }; + +int main(int argc, char **argv) +{ + parse_opt(argc, argv); + resolve_hostname(argv[argc - 1]); + + fprintf(stderr, "protocol: %s\n", sock_names[cfg_proto]); + fprintf(stderr, "payload: %u\n", payload_len); + fprintf(stderr, "server port: %u\n", dest_port); + fprintf(stderr, "\n"); + + if (do_ipv4) + do_main(PF_INET); + if (do_ipv6) + do_main(PF_INET6); + + return 0; +} From 35b6b14dafa71af9076b5861a5977e63645116bd Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 7 Aug 2014 19:33:01 -0400 Subject: [PATCH 02/20] msg_test: update to match v5 interface --- tests/{msg_tstamp.c => txtimestamp.c} | 79 +++++++++++++++++++++------ 1 file changed, 61 insertions(+), 18 deletions(-) rename tests/{msg_tstamp.c => txtimestamp.c} (88%) diff --git a/tests/msg_tstamp.c b/tests/txtimestamp.c similarity index 88% rename from tests/msg_tstamp.c rename to tests/txtimestamp.c index a870dce..58cc723 100644 --- a/tests/msg_tstamp.c +++ b/tests/txtimestamp.c @@ -2,10 +2,10 @@ * Copyright 2014 Google Inc. * Author: willemb@google.com (Willem de Bruijn) * - * Conformance tests for MSG_TSTAMP, including + * Conformance tests for software tx timestamping, including * - * - UDP MSG_TSTAMP, MSG_TSTAMP_ENQ - * - TCP MSG_TSTAMP, MSG_TSTAMP_ENQ and MSG_TSTAMP_ACK + * - SCHED, SND and ACK timestamps + * - RAW, UDP and TCP * - IPv4 and IPv6 * - various packet sizes (to test GSO and TSO) * @@ -15,7 +15,7 @@ * This test requires a dummy TCP server. * A simple `nc6 [-u] -l -p $DESTPORT` will do * - * Tested against Linux 3.16-rc1 (7171511eaec5) + * Tested against net-next (09ddb8e) * * * This program is free software; you can redistribute it and/or modify it @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -62,19 +63,20 @@ #define MSG_TSTAMP 0x100000 #define MSG_TSTAMP_ACK 0x200000 #define MSG_TSTAMP_ENQ 0x400000 +#define MSG_TSTAMP_ANY (MSG_TSTAMP | MSG_TSTAMP_ACK | MSG_TSTAMP_ENQ) #ifndef SCM_TSTAMP_SND struct scm_timestamping { struct timespec ts[3]; }; -#define SCM_TSTAMP_SND 0x1 /* driver passed skb to NIC */ -#define SCM_TSTAMP_ACK 0x2 /* transport layer saw ACK */ -#define SCM_TSTAMP_ENQ 0x4 /* stack passed skb to TC layer */ -#endif +#define SCM_TSTAMP_SND 0 +#define SCM_TSTAMP_SCHED 1 +#define SCM_TSTAMP_ACK 2 -#ifndef SOF_TIMESTAMPING_OPT_TSONLY -#define SOF_TIMESTAMPING_OPT_TSONLY (1<<7) +#define SOF_TIMESTAMPING_OPT_ID (1<<7) +#define SOF_TIMESTAMPING_TX_SCHED (1<<8) +#define SOF_TIMESTAMPING_TX_ACK (1<<9) #endif #define NUM_RUNS 4 @@ -137,7 +139,7 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, int tskey) const char *tsname; switch (tstype) { - case SCM_TSTAMP_ENQ: + case SCM_TSTAMP_SCHED: tsname = " ENQ"; break; case SCM_TSTAMP_SND: @@ -153,6 +155,19 @@ static void print_timestamp(struct scm_timestamping *tss, int tstype, int tskey) __print_timestamp(tsname, &tss->ts[0], tskey); } +static void __poll(int fd) +{ + struct pollfd pollfd; + int ret; + + memset(&pollfd, 0, sizeof(pollfd)); + pollfd.events = POLLIN; + pollfd.fd = fd; + ret = poll(&pollfd, 1, 100); + if (ret == -1 && errno != EAGAIN) + error(1, errno, "poll"); +} + static void __recv_errmsg_cmsg(struct msghdr *msg) { struct sock_extended_err *serr = NULL; @@ -182,7 +197,7 @@ static void __recv_errmsg_cmsg(struct msghdr *msg) } if (serr && tss) - print_timestamp(tss, serr->ee_info & 0x3FF, serr->ee_data); + print_timestamp(tss, serr->ee_info, serr->ee_data); } static int recv_errmsg(int fd) @@ -231,6 +246,36 @@ static int recv_errmsg(int fd) return ret == -1; } +static int setsockopt_ts(int fd, int flags) +{ + int val; + + val = 0; + if (flags & MSG_TSTAMP_ANY) { + if (flags & MSG_TSTAMP) + val |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (flags & MSG_TSTAMP_ENQ) + val |= SOF_TIMESTAMPING_TX_SCHED; + if (flags & MSG_TSTAMP_ACK) + val |= SOF_TIMESTAMPING_TX_ACK; + + val |= SOF_TIMESTAMPING_OPT_ID; + + flags &= ~MSG_TSTAMP_ANY; + } + +#if 0 + if (tstamp_no_payload) + val |= SOF_TIMESTAMPING_OPT_TX_NO_PAYLOAD; +#endif + + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, + (char *) &val, sizeof(val))) + error(1, 0, "setsockopt"); + + return flags; +} + static void do_test(int family, unsigned int flags) { char *buf; @@ -272,12 +317,7 @@ static void do_test(int family, unsigned int flags) } } - if (tstamp_no_payload) { - val = SOF_TIMESTAMPING_OPT_TSONLY; - if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, - (char *) &val, sizeof(val))) - error(1, 0, "setsockopt no payload"); - } + flags = setsockopt_ts(fd, flags); for (i = 0; i < NUM_RUNS; i++) { memset(&ts_prev, 0, sizeof(ts_prev)); @@ -326,6 +366,9 @@ static void do_test(int family, unsigned int flags) usleep(50 * 1000); print_timestamp_usr(); + + usleep(100); // to handle bug where poll fails + __poll(fd); while (!recv_errmsg(fd)) {} } From a5e4ce87197fd982a73d242d005aa9d1e95a6164 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 8 Aug 2014 10:13:03 -0400 Subject: [PATCH 03/20] minor: remove superfluous (and hacky) sleep --- tests/txtimestamp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/txtimestamp.c b/tests/txtimestamp.c index 58cc723..9c70a69 100644 --- a/tests/txtimestamp.c +++ b/tests/txtimestamp.c @@ -367,7 +367,6 @@ static void do_test(int family, unsigned int flags) print_timestamp_usr(); - usleep(100); // to handle bug where poll fails __poll(fd); while (!recv_errmsg(fd)) {} } From 8fda9d4d3415e0f7a5b54d7fe27c2c703ae4e99e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 May 2015 08:31:55 -0400 Subject: [PATCH 04/20] test: benchmark for packet socket rollover A benchmark process that creates multiple packet sockets in a single fanout group. It spawns one process per socket and pins each process to its own core. Each process reads the packets arriving on that cpu. The socket group has flag PACKET_FANOUT_FLAG_ROLLOVER enabled, so that a saturated cpu can offload packets to others in the group. Use this with a remote packet generator (such as pktgen.ko) to cause high load. Send a single 4-tuple to cause load imbalance. To create socket overload without saturating the physical link, limit the processing rate of each socket reader process by passing -l $RATE. Signed-off-by: Willem de Bruijn --- tests/bench_rollover.c | 306 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 tests/bench_rollover.c diff --git a/tests/bench_rollover.c b/tests/bench_rollover.c new file mode 100644 index 0000000..2ebef55 --- /dev/null +++ b/tests/bench_rollover.c @@ -0,0 +1,306 @@ +/* + * Copyright 2015 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * bench_rollover: stress test the Linux kernel PF_PACKET rollover feature. + * + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DFLT_NUM_SOCK 8 + +#define RING_NUM_FRAMES 1024 +#define RING_FRAME_LEN 4096 + +#ifndef PACKET_ROLLOVER_STATS +#define PACKET_ROLLOVER_STATS 21 + +struct tpacket_rollover_stats { + unsigned long tp_all; + unsigned long tp_huge; + unsigned long tp_failed; +}; +#endif + +static bool do_stop; + +static int cfg_num_sock = DFLT_NUM_SOCK; +static bool cfg_use_ring; +static int cfg_ratelimit_ms; +static bool cfg_stats_rollover; +static bool cfg_verbose; + +static void setcpu(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask)) + error(1, errno, "sched.%d", cpu); +} + +static void bindtodev(int cpu, int fd, const char *dev) +{ + struct sockaddr_ll addr = {}; + static int ifindex; + + if (!ifindex) + ifindex = if_nametoindex(dev); + if (!ifindex) + error(1, errno, "if_nametoindex.%s", dev); + + addr.sll_family = AF_PACKET; + addr.sll_ifindex = ifindex; + addr.sll_protocol = htons(ETH_P_IP); + addr.sll_halen = ETH_ALEN; + if (bind(fd, (void *) &addr, sizeof(addr))) + error(1, errno, "bind.%d", cpu); +} + +static void sighandler(int sig) +{ + do_stop = true; +} + +static char *setrxring(int fd) +{ + struct tpacket_req req = { + .tp_block_size = RING_FRAME_LEN, + .tp_frame_size = RING_FRAME_LEN, + .tp_block_nr = RING_NUM_FRAMES, + .tp_frame_nr = RING_NUM_FRAMES, + }; + char *ring; + int val = TPACKET_V2; + + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req))) + error(1, errno, "setsockopt ring"); + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (!ring) + error(1, errno, "setsockopt mmap"); + + return ring; +} + +int reader(int cpu, int fd) +{ + struct tpacket2_hdr *hdr; + struct tpacket_stats tpstats; + struct tpacket_rollover_stats rstats; + struct pollfd pfd; + char buf[ETH_FRAME_LEN], *ring = NULL; + unsigned long packets = 0; + socklen_t slen; + int ret, index = 0; + + setcpu(cpu); + + if (cfg_use_ring) { + ring = setrxring(fd); + hdr = (void *) ring; + memset(ring, 0, RING_NUM_FRAMES * RING_FRAME_LEN); + } + + while (!do_stop) { + if (ring) { + int budget = RING_NUM_FRAMES; + while (hdr->tp_status & TP_STATUS_USER && budget--) { + memcpy(buf, ((void *) hdr) + hdr->tp_net, + ETH_FRAME_LEN /* add some copy cost */); + hdr->tp_status = TP_STATUS_KERNEL; + + packets++; + index++; + if (index == RING_NUM_FRAMES) + index = 0; + + hdr = (void *) ((unsigned long) ring) + + (index * RING_FRAME_LEN); + + if (cfg_ratelimit_ms && + (packets % (cfg_ratelimit_ms / 10)) == 0) + usleep(100); + } + if (do_stop) + continue; + + pfd.fd = fd; + pfd.events = POLLIN; + pfd.revents = 0; + ret = poll(&pfd, 1, 100); + } else { + ret = read(fd, buf, sizeof(buf)); + } + if (ret == -1 && errno == EINTR) + break; + if (ret == -1) + error(1, errno, "%s.%d", ring ? "poll" : "read", cpu); + if (!ring) + packets++; + } + + slen = sizeof(tpstats); + if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, + &tpstats, &slen)) + error(1, errno, "packetstat.%d", cpu); + + if (cfg_stats_rollover) { + slen = sizeof(rstats); + if (getsockopt(fd, SOL_PACKET, PACKET_ROLLOVER_STATS, + &rstats, &slen)) + error(1, errno, "rolloverstat.%d", cpu); + } else { + memset(&rstats, 0, sizeof(rstats)); + } + + if (packets) { + usleep(cpu * 4000); /* poor man's sorting */ + fprintf(stderr, "%3d %10lu %10u %10u %10lu %10lu %10lu\n", + cpu, packets, + tpstats.tp_packets - tpstats.tp_drops, + tpstats.tp_drops, + rstats.tp_all, + rstats.tp_huge, + rstats.tp_failed); + } + + if (close(fd)) + error(1, errno, "close.%d", cpu); + + return 0; +} + +static void __attribute__((noreturn)) usage(const char *filepath) +{ + fprintf(stderr, "Usage: %s [-h] [-l len] [-n num] [-r] [-s] [-v]\n", + filepath); + exit(1); +} + +static void parse_opt(int argc, char **argv) +{ + const char on[] = "ON", off[] = "OFF"; + char c; + + while ((c = getopt(argc, argv, "hl:n:rsv")) != -1) { + switch (c) { + case 'h': + usage(argv[0]); + break; + case 'l': + cfg_ratelimit_ms = strtoul(optarg, NULL, 10); + break; + case 'n': + cfg_num_sock = strtoul(optarg, NULL, 10); + break; + case 'r': + cfg_use_ring = true; + break; + case 's': + cfg_stats_rollover = true; + break; + case 'v': + cfg_verbose = true; + break; + default: + error(1, 0, "unknown parameter %c", c); + } + } + + if (cfg_verbose) + fprintf(stderr, "socks: %d\n" + "rate: %d K pps\n" + "ring: %s\n" + "rstat: %s\n", + cfg_num_sock, + cfg_ratelimit_ms ? cfg_ratelimit_ms : -1, + cfg_use_ring ? on : off, + cfg_stats_rollover ? on : off); +} + +int main(int argc, char **argv) +{ + struct sigaction sig = {}; + pid_t pid, pgid; + int i, fd, val; + + parse_opt(argc, argv); + + pgid = getpgid(0); + + /* make recv return with EINTR */ + sig.sa_handler = sighandler; + if (sigaction(SIGINT, &sig, NULL)) + error(1, errno, "sigaction"); + + for (i = 0; i < cfg_num_sock; i++) { + fd = socket(PF_PACKET, SOCK_RAW, 0); + if (fd == -1) + error(1, errno, "socket.%d", i); + + bindtodev(i, fd, "eth0"); + + val = PACKET_FANOUT_CPU | PACKET_FANOUT_FLAG_ROLLOVER; + val <<= 16; + if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, + &val, sizeof(val))); + + pid = fork(); + if (pid == -1) + error(1, errno, "fork.%d", i); + if (!pid) + return reader(i, fd); + } + + fprintf(stderr, "Press [Enter] to exit\n"); + getchar(); + + fprintf(stderr, "cpu rx rx.k drop.k rollover r.huge r.failed\n"); + kill(-pgid, SIGINT); + usleep(cfg_num_sock * 5000); + return 0; +} + From 54797327c2feb638637b892bc75974c186881938 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 9 May 2015 17:47:13 -0400 Subject: [PATCH 05/20] test: bench_rollover: spawn after socket init delay concurrent execution until the socket fanout group is completely initialized. Signed-off-by: Willem de Bruijn --- tests/bench_rollover.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/bench_rollover.c b/tests/bench_rollover.c index 2ebef55..39fada1 100644 --- a/tests/bench_rollover.c +++ b/tests/bench_rollover.c @@ -45,6 +45,7 @@ #include #include +#define MAX_NUM_SOCK 64 #define DFLT_NUM_SOCK 8 #define RING_NUM_FRAMES 1024 @@ -125,25 +126,20 @@ static char *setrxring(int fd) return ring; } -int reader(int cpu, int fd) +int reader(int cpu, int fd, char *ring) { struct tpacket2_hdr *hdr; struct tpacket_stats tpstats; struct tpacket_rollover_stats rstats; struct pollfd pfd; - char buf[ETH_FRAME_LEN], *ring = NULL; + char buf[ETH_FRAME_LEN]; unsigned long packets = 0; socklen_t slen; int ret, index = 0; setcpu(cpu); - if (cfg_use_ring) { - ring = setrxring(fd); - hdr = (void *) ring; - memset(ring, 0, RING_NUM_FRAMES * RING_FRAME_LEN); - } - + hdr = (void *) ring; while (!do_stop) { if (ring) { int budget = RING_NUM_FRAMES; @@ -207,6 +203,9 @@ int reader(int cpu, int fd) rstats.tp_failed); } + if (ring && munmap(ring, RING_FRAME_LEN * RING_NUM_FRAMES)) + error(1, errno, "munmap.%d", cpu); + if (close(fd)) error(1, errno, "close.%d", cpu); @@ -235,6 +234,8 @@ static void parse_opt(int argc, char **argv) break; case 'n': cfg_num_sock = strtoul(optarg, NULL, 10); + if (cfg_num_sock > MAX_NUM_SOCK) + error(1, 0, "num exceeds %u\n", MAX_NUM_SOCK); break; case 'r': cfg_use_ring = true; @@ -263,9 +264,11 @@ static void parse_opt(int argc, char **argv) int main(int argc, char **argv) { + static int fds[MAX_NUM_SOCK]; + static char *rings[MAX_NUM_SOCK]; struct sigaction sig = {}; pid_t pid, pgid; - int i, fd, val; + int i, val; parse_opt(argc, argv); @@ -277,22 +280,29 @@ int main(int argc, char **argv) error(1, errno, "sigaction"); for (i = 0; i < cfg_num_sock; i++) { - fd = socket(PF_PACKET, SOCK_RAW, 0); - if (fd == -1) + fds[i] = socket(PF_PACKET, SOCK_RAW, 0); + if (fds[i] == -1) error(1, errno, "socket.%d", i); - bindtodev(i, fd, "eth0"); + bindtodev(i, fds[i], "eth0"); + + if (cfg_use_ring) { + rings[i] = setrxring(fds[i]); + memset(rings[i], 0, RING_NUM_FRAMES * RING_FRAME_LEN); + } val = PACKET_FANOUT_CPU | PACKET_FANOUT_FLAG_ROLLOVER; val <<= 16; - if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, + if (setsockopt(fds[i], SOL_PACKET, PACKET_FANOUT, &val, sizeof(val))); + } + for (i = 0; i < cfg_num_sock; i++) { pid = fork(); if (pid == -1) error(1, errno, "fork.%d", i); if (!pid) - return reader(i, fd); + return reader(i, fds[i], rings[i]); } fprintf(stderr, "Press [Enter] to exit\n"); From 5c6cb505808e6307e5f670ce0216f70e991ea3cf Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 10:47:19 -0400 Subject: [PATCH 06/20] test: convert bench_rollover to 64-bit kernel counters --- tests/bench_rollover.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/bench_rollover.c b/tests/bench_rollover.c index 39fada1..90db025 100644 --- a/tests/bench_rollover.c +++ b/tests/bench_rollover.c @@ -25,8 +25,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -55,9 +57,9 @@ #define PACKET_ROLLOVER_STATS 21 struct tpacket_rollover_stats { - unsigned long tp_all; - unsigned long tp_huge; - unsigned long tp_failed; + __aligned_u64 tp_all; + __aligned_u64 tp_huge; + __aligned_u64 tp_failed; }; #endif @@ -194,13 +196,13 @@ int reader(int cpu, int fd, char *ring) if (packets) { usleep(cpu * 4000); /* poor man's sorting */ - fprintf(stderr, "%3d %10lu %10u %10u %10lu %10lu %10lu\n", + fprintf(stderr, "%3d %10lu %10u %10u %10" PRIu64 " %10" PRIu64 " %10" PRIu64 "\n", cpu, packets, tpstats.tp_packets - tpstats.tp_drops, tpstats.tp_drops, - rstats.tp_all, - rstats.tp_huge, - rstats.tp_failed); + (uint64_t) rstats.tp_all, + (uint64_t) rstats.tp_huge, + (uint64_t) rstats.tp_failed); } if (ring && munmap(ring, RING_FRAME_LEN * RING_NUM_FRAMES)) From 951f460dc86eda6605b0752e4279519b5a610d9e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 31 Jan 2016 14:54:52 -0500 Subject: [PATCH 07/20] tests: demo process for tpacket_rcv with vnet Add a test that reads from a packet socket ring with both options PACKET_RX_RING and PACKET_VNET_HDR enabled. --- tests/psock_rxring_vnet.c | 254 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 tests/psock_rxring_vnet.c diff --git a/tests/psock_rxring_vnet.c b/tests/psock_rxring_vnet.c new file mode 100644 index 0000000..fcd3fea --- /dev/null +++ b/tests/psock_rxring_vnet.c @@ -0,0 +1,254 @@ +/* + * A packet sniffer that combines PACKET_RX_RING and PACKET_VNET_HDR + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool cfg_enable_vnet = false; +static int cfg_num_frames = 1024; +static int cfg_runtime_sec = 1; + +static struct tpacket_req req; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static int socket_open(void) +{ + int fd, val; + + fd = socket(PF_PACKET, SOCK_RAW, 0 /* disable until ring is ready */); + if (fd == -1) + error(1, errno, "socket"); + + val = TPACKET_V2; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + + if (cfg_enable_vnet) { + val = 1; + if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, + &val, sizeof(val))) + error(1, errno, "setsockopt vnet_hdr"); + } + + return fd; +} + +static void socket_bind(int fd) +{ + struct sockaddr_ll addr = {}; + + addr.sll_family = AF_PACKET; + addr.sll_protocol = htons(ETH_P_IP); + if (bind(fd, (void *) &addr, sizeof(addr)) == -1) + error(1, errno, "packetsock bind"); +} + +static char * ring_open(int fd) +{ + char *ring; + + req.tp_frame_size = 256; + req.tp_frame_nr = cfg_num_frames; + req.tp_block_size = getpagesize(); + req.tp_block_nr = (req.tp_frame_size * req.tp_frame_nr) / + req.tp_block_size; + + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, + (void*) &req, sizeof(req))) + error(1, errno, "setsockopt ring"); + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ring == MAP_FAILED) + error(1, errno, "mmap"); + + return ring; +} + +/* portability warning: ignoring virtio endiannes */ +static void parse_vnet(struct virtio_net_hdr *vnet) +{ + uint16_t gso_type; + char *type; + + gso_type = vnet->gso_type & ~VIRTIO_NET_HDR_GSO_ECN; + switch (gso_type) { + case VIRTIO_NET_HDR_GSO_NONE: + type = "none"; + break; + case VIRTIO_NET_HDR_GSO_TCPV4: + type = "tcpv4"; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + type = "tcpv6"; + break; + case VIRTIO_NET_HDR_GSO_UDP: + type = "udp"; + break; + default: + type = "unknown"; + } + + fprintf(stderr, "vnet: gso_type=%s gso_size=%u hlen=%u ecn=%s\n", + type, vnet->gso_size, vnet->hdr_len, + (vnet->gso_type & VIRTIO_NET_HDR_GSO_ECN) ? "on " : "off"); + + if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) + fprintf(stderr, "csum: start=%u off=%u\n", + vnet->csum_start, vnet->csum_offset); + +} + +static void parse_ipv4(struct iphdr *iph) +{ + char saddr[INET_ADDRSTRLEN], daddr[INET_ADDRSTRLEN]; + + if (!inet_ntop(AF_INET, &iph->saddr, saddr, sizeof(saddr))) + error(1, errno, "inet_ntop saddr"); + if (!inet_ntop(AF_INET, &iph->daddr, daddr, sizeof(daddr))) + error(1, errno, "inet_ntop daddr"); + + fprintf(stderr, "ip: src=%s dst=%s proto=%u len=%u\n", + saddr, daddr, iph->protocol, ntohs(iph->tot_len)); +} + +/* portability warning: assumes ethernet */ +static void __ring_read(struct tpacket2_hdr *hdr, void *data) +{ + struct timeval tv; + uint16_t eth_proto; + struct ethhdr *eth = (void *) data; + + gettimeofday(&tv, NULL); + fprintf(stderr, "\npkt: %lu.%lu len=%u\n", + tv.tv_sec, tv.tv_usec, hdr->tp_len); + + if (cfg_enable_vnet) + parse_vnet(data - sizeof(struct virtio_net_hdr)); + + eth_proto = htons(eth->h_proto); + fprintf(stderr, "eth: proto=0x%x\n", eth_proto); + if (eth_proto == ETH_P_IP) + parse_ipv4(data + ETH_HLEN); +} + +static void ring_read(void *ring, int index) +{ + struct tpacket2_hdr *header = ring + (index * req.tp_frame_size); + + if (!(header->tp_status & TP_STATUS_USER)) + error(1, 0, "ring: no data (0x%x)", header->tp_status); + + + __ring_read(header, ((void *) header) + header->tp_mac); + header->tp_status = TP_STATUS_KERNEL; +} + +static void ring_close(char *ring) +{ + if (munmap(ring, req.tp_block_size * req.tp_block_nr)) + error(1, errno, "munmap"); +} + +static bool ring_poll(int fd) +{ + struct pollfd pfd; + int ret; + + pfd.fd = fd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 100); + if (ret == -1) + error(1, errno, "poll"); + if (ret == 0) + return false; + if (pfd.revents != POLLIN) + error(1, 0, "unexpected event (0x%x)", pfd.revents); + + return true; +} + +static void do_run(int fd, char *ring) +{ + int64_t tstop, index = 0; + + tstop = gettimeofday_ms() + (cfg_runtime_sec * 1000); + + while (gettimeofday_ms() < tstop) { + if (ring_poll(fd)) { + ring_read(ring, index % cfg_num_frames); + index++; + } + } + + fprintf(stderr, "total: %lu packets\n", index); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "v")) != -1) + { + switch (c) { + case 'v': + cfg_enable_vnet = true; + break; + default: + error(1, 0, "unknown option %c", c); + } + } +} + +int main(int argc, char **argv) +{ + char *ring; + int fd; + + parse_opts(argc, argv); + + fprintf(stderr, "vnet: %sabled\n", cfg_enable_vnet ? "en" : "dis"); + + fd = socket_open(); + socket_bind(fd); + ring = ring_open(fd); + + do_run(fd, ring); + + ring_close(ring); + if (close(fd) == -1) + error(1, errno, "close"); + + return 0; +} From 521a1f7b194980db235c492b93603e887eb34399 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 31 Jan 2016 16:22:40 -0500 Subject: [PATCH 08/20] tests: demo process for tpacket_snd with vnet Add a test that writes over a packet socket both - with and without PACKET_TX_RING - with and without PACKET_VNET_HDR --- tests/psock_txring_vnet.c | 324 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 tests/psock_txring_vnet.c diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c new file mode 100644 index 0000000..45b2528 --- /dev/null +++ b/tests/psock_txring_vnet.c @@ -0,0 +1,324 @@ +/* Inject packets with PACKET_TX_RING and PACKET_VNET_HDR */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool cfg_enable_ring = true; +static bool cfg_enable_vnet = false; +static char *cfg_ifname = "eth0"; +static int cfg_ifindex; +static int cfg_num_frames = 3; +static int cfg_payload_len = 500; + +static struct tpacket_req req; +static struct in_addr ip_saddr, ip_daddr; + +/* must configure real daddr (should really infer or pass on cmdline) */ +const char cfg_mac_src[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }; +const char cfg_mac_dst[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }; + +static int socket_open(void) +{ + int fd, val; + + fd = socket(PF_PACKET, SOCK_RAW, 0 /* disable reading */); + if (fd == -1) + error(1, errno, "socket"); + + val = TPACKET_V2; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + + if (cfg_enable_vnet) { + val = 1; + if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, + &val, sizeof(val))) + error(1, errno, "setsockopt vnet_hdr"); + } + + return fd; +} + +static char * ring_open(int fd) +{ + char *ring; + + req.tp_frame_size = getpagesize() << 1; + req.tp_frame_nr = cfg_num_frames; + req.tp_block_size = req.tp_frame_size; + req.tp_block_nr = cfg_num_frames; + + if (setsockopt(fd, SOL_PACKET, PACKET_TX_RING, + (void*) &req, sizeof(req))) + error(1, errno, "setsockopt ring"); + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ring == MAP_FAILED) + error(1, errno, "mmap"); + + return ring; +} + +/* warning: does not handle odd length */ +static unsigned long add_csum_hword(const uint16_t *start, int num_u16) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < num_u16; i++) + sum += start[i]; + + return sum; +} + +static uint16_t build_ip_csum(const uint16_t *start, int num_u16, + unsigned long sum) +{ + sum += add_csum_hword(start, num_u16); + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + return ~sum; +} + +static uint16_t get_tcp_v4_csum(const struct iphdr *iph, + const struct tcphdr *tcph, + int length) +{ + unsigned long pseudo_sum = 0; + + pseudo_sum += add_csum_hword((void *) &iph->saddr, 2); + pseudo_sum += add_csum_hword((void *) &iph->daddr, 2); + pseudo_sum += htons(IPPROTO_TCP); + pseudo_sum += htons(length); + + if (cfg_enable_vnet) + return pseudo_sum; + else + return build_ip_csum((void *) tcph, length >> 1, pseudo_sum); +} + +static int frame_fill(void *buffer, unsigned int payload_len) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + int off = 0; + + if (cfg_enable_vnet) { + struct virtio_net_hdr *vnet; + + vnet = buffer; + + vnet->hdr_len = ETH_HLEN + sizeof(*iph) + sizeof(*tcph); + + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = ETH_HLEN + sizeof(*iph); + vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); + + vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - + sizeof(struct tcphdr); + off += sizeof(*vnet); + } + + eth = buffer + off; + memcpy(ð->h_source, cfg_mac_src, ETH_ALEN); + memcpy(ð->h_dest, cfg_mac_dst, ETH_ALEN); + eth->h_proto = htons(ETH_P_IP); + off += ETH_HLEN; + + iph = buffer + off; + iph->ttl = 8; + iph->ihl = 5; + iph->version = 4; + iph->saddr = ip_saddr.s_addr; + iph->daddr = ip_daddr.s_addr; + iph->protocol = IPPROTO_TCP; + iph->tot_len = htons(sizeof(*iph) + sizeof(*tcph) + payload_len); + iph->check = build_ip_csum((const void *) iph, 10 /* hwords */, 0); + off += sizeof(*iph); + + tcph = buffer + off; + tcph->dest = htons(9); + tcph->source = htons(9); + tcph->doff = sizeof(*tcph) >> 2; + off += sizeof(*tcph); + + memset(buffer + off, 'a', payload_len); + + tcph->check = get_tcp_v4_csum(iph, tcph, + (sizeof(*tcph) + payload_len)); + return off + payload_len; +} + +static void ring_write(void *slot) +{ + struct tpacket2_hdr *header = slot; + + if (header->tp_status != TP_STATUS_AVAILABLE) + error(1, 0, "write: slot not available"); + + memset(slot + TPACKET2_HDRLEN, 0, req.tp_frame_size - TPACKET2_HDRLEN); + + header->tp_mac = TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); + header->tp_len = frame_fill(slot + header->tp_mac, cfg_payload_len); + header->tp_status = TP_STATUS_SEND_REQUEST; +} + +static void socket_write(int fd) +{ + static char buf[ETH_HLEN + (1 << 16)]; + int len, ret; + + memset(buf, 0, sizeof(buf)); + len = frame_fill(buf, cfg_payload_len); + ret = send(fd, buf, len, 0); + if (ret == -1) + error(1, errno, "send"); + if (ret < len) + error(1, 0, "send: %uB < %uB\n", ret, len); +} + +static void socket_bind(int fd) +{ + struct sockaddr_ll addr = { 0 }; + + addr.sll_family = AF_PACKET; + addr.sll_ifindex = cfg_ifindex; + addr.sll_protocol = htons(ETH_P_IP); + addr.sll_halen = ETH_ALEN; + + if (bind(fd, (void *) &addr, sizeof(addr))) + error(1, errno, "bind"); +} + +static void ring_wake_kernel(int fd) +{ + int ret; + + ret = send(fd, NULL, 0, 0); + if (ret < 0) + error(1, errno, "send"); + if (!ret) + error(1, 0, "send: no data"); + + fprintf(stderr, "send: %uB\n", ret); +} + +static void ring_close(char *ring) +{ + if (munmap(ring, req.tp_block_size * req.tp_block_nr)) + error(1, errno, "munmap"); +} + +static void do_run_ring(int fd, char *ring) +{ + int i; + + for (i = 0; i < cfg_num_frames; i++) + ring_write(ring + (i * req.tp_frame_size)); + + ring_wake_kernel(fd); +} + +static void do_run(int fd) +{ + int i; + + for (i = 0; i < cfg_num_frames; i++) + socket_write(fd); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "d:vi:l:ns:")) != -1) + { + switch (c) { + case 'd': + if (!inet_aton(optarg, &ip_daddr)) + error(1, 0, "bad ipv4 destination address"); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'l': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'n': + cfg_enable_ring = false; + break; + case 's': + if (!inet_aton(optarg, &ip_saddr)) + error(1, 0, "bad ipv4 destination address"); + break; + case 'v': + cfg_enable_vnet = true; + break; + default: + error(1, 0, "unknown option %c", c); + } + } + + if (!ip_saddr.s_addr || !ip_daddr.s_addr) + error(1, 0, "must specify ipv4 source and destination"); + + cfg_ifindex = if_nametoindex(cfg_ifname); + if (!cfg_ifindex) + error(1, errno, "ifnametoindex"); +} + +int main(int argc, char **argv) +{ + char *ring; + int fd; + + parse_opts(argc, argv); + + fprintf(stderr, "vnet: %sabled\n", cfg_enable_vnet ? "en" : "dis"); + + fd = socket_open(); + socket_bind(fd); + + if (cfg_enable_ring) { + ring = ring_open(fd); + do_run_ring(fd, ring); + ring_close(ring); + } else { + do_run(fd); + } + + if (close(fd) == -1) + error(1, errno, "close"); + + return 0; +} From ed7fd8ad0c9f11785aa4567703ea9b0e8f9ce30e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 20 Mar 2016 21:19:05 -0400 Subject: [PATCH 09/20] tests: extend psock_txring_vnet features Add - PACKET_QDISC_BYPASS support - Short input support - override length flag - drop CAP_SYS_RAWIO to force min length check in kernel Fix - Always allocate large enough ring slot --- tests/psock_txring_vnet.c | 88 +++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c index 45b2528..c9bdf28 100644 --- a/tests/psock_txring_vnet.c +++ b/tests/psock_txring_vnet.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -28,12 +30,18 @@ #include #include +#ifndef PACKET_QDISC_BYPASS +#define PACKET_QDISC_BYPASS 20 +#endif + static bool cfg_enable_ring = true; static bool cfg_enable_vnet = false; static char *cfg_ifname = "eth0"; static int cfg_ifindex; -static int cfg_num_frames = 3; -static int cfg_payload_len = 500; +static int cfg_num_frames = 4; +static unsigned int cfg_override_len = UINT_MAX; +static unsigned int cfg_payload_len = 500; +static bool cfg_qdisc_bypass = false; static struct tpacket_req req; static struct in_addr ip_saddr, ip_daddr; @@ -54,6 +62,13 @@ static int socket_open(void) if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) error(1, errno, "setsockopt version"); + if (cfg_qdisc_bypass) { + val = 1; + if (setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, + &val, sizeof(val))) + error(1, errno, "setsockopt qdisc bypass"); + } + if (cfg_enable_vnet) { val = 1; if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, @@ -67,8 +82,16 @@ static int socket_open(void) static char * ring_open(int fd) { char *ring; - - req.tp_frame_size = getpagesize() << 1; + unsigned int frame_sz; + + frame_sz = cfg_payload_len + 100 /* overestimate */; + frame_sz = 1 << (32 - __builtin_clz(frame_sz)); + if (frame_sz < getpagesize()) + frame_sz = getpagesize(); + + fprintf(stderr, "frame size: %u\n", frame_sz); + + req.tp_frame_size = frame_sz; req.tp_frame_nr = cfg_num_frames; req.tp_block_size = req.tp_frame_size; req.tp_block_nr = cfg_num_frames; @@ -182,14 +205,19 @@ static int frame_fill(void *buffer, unsigned int payload_len) static void ring_write(void *slot) { struct tpacket2_hdr *header = slot; + int len; if (header->tp_status != TP_STATUS_AVAILABLE) error(1, 0, "write: slot not available"); memset(slot + TPACKET2_HDRLEN, 0, req.tp_frame_size - TPACKET2_HDRLEN); + len = frame_fill(slot + header->tp_mac, cfg_payload_len); + if (cfg_override_len < len) + len = cfg_override_len; + header->tp_mac = TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); - header->tp_len = frame_fill(slot + header->tp_mac, cfg_payload_len); + header->tp_len = len; header->tp_status = TP_STATUS_SEND_REQUEST; } @@ -200,6 +228,10 @@ static void socket_write(int fd) memset(buf, 0, sizeof(buf)); len = frame_fill(buf, cfg_payload_len); + + if (cfg_override_len < len) + len = cfg_override_len; + ret = send(fd, buf, len, 0); if (ret == -1) error(1, errno, "send"); @@ -257,13 +289,42 @@ static void do_run(int fd) socket_write(fd); } +static void drop_capability(uint32_t capability) +{ + struct __user_cap_header_struct hdr = {}; + struct __user_cap_data_struct data = {}; + + hdr.pid = getpid(); + hdr.version = _LINUX_CAPABILITY_VERSION; + + if (capget(&hdr, &data) == -1) + error(1, errno, "capget"); + fprintf(stderr, "cap.1: eff=0x%x perm=0x%x\n", + data.effective, data.permitted); + + data.effective &= ~CAP_TO_MASK(capability); + data.permitted &= ~CAP_TO_MASK(capability); + data.inheritable = 0; + + if (capset(&hdr, &data) == -1) + error(1, errno, "capset"); + + if (capget(&hdr, &data) == -1) + error(1, errno, "capget"); + fprintf(stderr, "cap.2: eff=0x%x perm=0x%x\n", + data.effective, data.permitted); +} + static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "d:vi:l:ns:")) != -1) + while ((c = getopt(argc, argv, "cd:i:l:L:n:Nqs:v")) != -1) { switch (c) { + case 'c': + drop_capability(CAP_SYS_RAWIO); + break; case 'd': if (!inet_aton(optarg, &ip_daddr)) error(1, 0, "bad ipv4 destination address"); @@ -274,9 +335,18 @@ static void parse_opts(int argc, char **argv) case 'l': cfg_payload_len = strtoul(optarg, NULL, 0); break; + case 'L': + cfg_override_len = strtoul(optarg, NULL, 0); + break; case 'n': + cfg_num_frames = strtoul(optarg, NULL, 0); + break; + case 'N': cfg_enable_ring = false; break; + case 'q': + cfg_qdisc_bypass = true; + break; case 's': if (!inet_aton(optarg, &ip_saddr)) error(1, 0, "bad ipv4 destination address"); @@ -295,6 +365,10 @@ static void parse_opts(int argc, char **argv) cfg_ifindex = if_nametoindex(cfg_ifname); if (!cfg_ifindex) error(1, errno, "ifnametoindex"); + + fprintf(stderr, "len: %u\n", cfg_num_frames); + fprintf(stderr, "num: %u\n", cfg_payload_len); + fprintf(stderr, "vnet: %sabled\n", cfg_enable_vnet ? "en" : "dis"); } int main(int argc, char **argv) @@ -304,8 +378,6 @@ int main(int argc, char **argv) parse_opts(argc, argv); - fprintf(stderr, "vnet: %sabled\n", cfg_enable_vnet ? "en" : "dis"); - fd = socket_open(); socket_bind(fd); From fa6d07b04912983cf56be89025503fc477aaa565 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Aug 2016 14:17:48 -0400 Subject: [PATCH 10/20] tools: import tcplate tcplate computes traffic shaping latency by reading egress packet timestamps with nflog and packet sockets. SO_TIMESTAMPING offers a more complete timestamping solution for processes that own file descriptors. Tcplate is geared at casual latency monitoring by administrators. --- tools/tcplate/Makefile | 16 ++ tools/tcplate/libnflog.c | 311 ++++++++++++++++++++ tools/tcplate/libnflog.h | 27 ++ tools/tcplate/libpsock.c | 186 ++++++++++++ tools/tcplate/libpsock.h | 36 +++ tools/tcplate/tcplate.c | 604 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 1180 insertions(+) create mode 100644 tools/tcplate/Makefile create mode 100644 tools/tcplate/libnflog.c create mode 100644 tools/tcplate/libnflog.h create mode 100644 tools/tcplate/libpsock.c create mode 100644 tools/tcplate/libpsock.h create mode 100644 tools/tcplate/tcplate.c diff --git a/tools/tcplate/Makefile b/tools/tcplate/Makefile new file mode 100644 index 0000000..4d8928c --- /dev/null +++ b/tools/tcplate/Makefile @@ -0,0 +1,16 @@ + +.PHONY: all clean distclean + +all: tcplate + +tcplate: tcplate.o libnflog.o libpsock.o + gcc -static $+ -o $@ + +%.o: %.c + gcc -c -Wall -Werror $< -o $@ + +clean: + -rm -f *.o + +distclean: clean + -rm -f tcplate diff --git a/tools/tcplate/libnflog.c b/tools/tcplate/libnflog.c new file mode 100644 index 0000000..08f808a --- /dev/null +++ b/tools/tcplate/libnflog.c @@ -0,0 +1,311 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Netlink support library + * Geared at NETLINK_NETFILTER + * + * Read netfilter nflog output, for instance: + * `iptables -A OUTPUT -j NFLOG --nflog-group=10` + * + * To timestamp every packet, use the xt_time match: + * `iptables -A OUTPUT \ + * -m time --timestart 00:00 --timestop 23:59 \ + * -j NFLOG --nflog-group=10` + * or even + * `iptables -A OUTPUT -m time -j NFLOG --nflog-group=10` + * + * TODO(willemb): optimize by using mmapped ring. + */ + +#define _GNU_SOURCE +#define _BSD_SOURCE /* for be64toh */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnflog.h" + +static int config_group = 10; /* nfnetlink group to follow */ +static int config_debug_lvl = 0; + +#define IOVLEN 8 +#define PKTLEN (1 << 11) + +static void __nflog_sendcmd(int fd, uint8_t cmd, void *msg, int msglen, + uint16_t family, uint16_t group_id) +{ + static int seq_id; + char buf[1024] __attribute__((aligned)); + struct nlmsghdr *nh; + struct nfgenmsg *ng; + struct nfattr *nfa; + int ret; + + memset(buf, 0, sizeof(buf)); + + nh = (void *) buf; + ng = (void *) buf + sizeof(*nh); + + nh->nlmsg_len = NLMSG_LENGTH(sizeof(*ng)); + nh->nlmsg_type = (NFNL_SUBSYS_ULOG << 8) | NFULNL_MSG_CONFIG; + nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nh->nlmsg_pid = 0; + nh->nlmsg_seq = ++seq_id; + + ng->nfgen_family = family; + ng->version = NFNETLINK_V0; + ng->res_id = htons(group_id); + + nfa = (void *) buf + NLMSG_ALIGN(nh->nlmsg_len); + nfa->nfa_type = cmd; + nfa->nfa_len = NFA_LENGTH(msglen); + + memcpy(NFA_DATA(nfa), msg, msglen); + + nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NFA_ALIGN(nfa->nfa_len); + + if (send(fd, buf, nh->nlmsg_len, 0) != nh->nlmsg_len) + error(1, errno, "sendcmd"); + + /* TODO(willemb): handle EINTR */ + ret = recv(fd, buf, sizeof(buf), 0); + if (ret == -1) + error(1, errno, "recv ctrl: sock error"); + if (ret < NLMSG_OK(nh, ret)) + error(1, 0, "recv ctrl: insufficient length"); + if (nh->nlmsg_type != NLMSG_ERROR) + error(1, 0, "recv ctrl: unexpected type"); + ret = *(int *) NLMSG_DATA(nh); + if (ret) + error(1, ret, "recv ctrl: nflog error"); +} + +static void nflog_sendcmd(int fd, uint8_t cmd, uint16_t family, + uint16_t group_id) +{ + struct nfulnl_msg_config_cmd msg; + + memset(&msg, 0, sizeof(msg)); + msg.command = cmd; + __nflog_sendcmd(fd, NFULA_CFG_CMD, &msg, sizeof(msg), family, group_id); +} + +static void nflog_sendcmd_mode(int fd, uint16_t family, uint16_t group_id, + uint8_t mode, uint32_t value) +{ + struct nfulnl_msg_config_mode msg; + + memset(&msg, 0, sizeof(msg)); + msg.copy_mode = mode; + msg.copy_range = htonl(value); + __nflog_sendcmd(fd, NFULA_CFG_MODE, &msg, sizeof(msg), family, group_id); +} + +static void nflog_attach_inet(int fd, unsigned int snaplen) +{ + nflog_sendcmd(fd, NFULNL_CFG_CMD_PF_UNBIND, AF_INET, 0); + /* TODO: recv ack */ + nflog_sendcmd(fd, NFULNL_CFG_CMD_PF_BIND, AF_INET, 0); + /* TODO: recv ack */ + nflog_sendcmd(fd, NFULNL_CFG_CMD_BIND, AF_UNSPEC, config_group); + /* TODO: recv ack */ + + nflog_sendcmd_mode(fd, AF_UNSPEC, config_group, NFULNL_COPY_PACKET, snaplen); + /* TODO: recv ack */ +} + +int nflog_init(unsigned int snaplen) +{ + struct sockaddr_nl nladdr; + int fd, val; + + if (snaplen > PKTLEN) + error(1, 0, "snaplen exceeds pktlen: can cause drops"); + + fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); + if (fd == -1) + error(1, errno, "socket"); + + val = 1 << 21; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val))) + error(1, errno, "setsockopt SO_RCVBUF"); + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_groups = 1 << config_group; + + if (bind(fd, (void *) &nladdr, sizeof(nladdr))) + error(1, errno, "bind"); + + nflog_attach_inet(fd, snaplen); + return fd; +} + +void nflog_exit(int fd) +{ + if (close(fd)) + error(1, errno, "close"); +} + +void nflog_parse(const void *data, unsigned int len, log_fn fn) +{ + const struct nlmsghdr *nh; + + for (nh = (void *) data; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) { + const struct nfulnl_msg_packet_timestamp *nf_ts; + const struct nfgenmsg *ng; + const struct nfattr *attr; + uint64_t ts_sec = 0, ts_usec = 0; + const char *pkt; + int plen = 0; + int alen; + + if (nh->nlmsg_type == NLMSG_ERROR) + error(1, 0, "netlink error"); + if (nh->nlmsg_type == NLMSG_NOOP) + error(1, 0, "netlink noop"); + if (nh->nlmsg_len < sizeof(*nh) || len < nh->nlmsg_len) { + fprintf(stderr, "message truncated\n"); + continue; + } + + ng = NLMSG_DATA(nh); + if (config_debug_lvl) + fprintf(stderr, "P family=%s version=%d group=%d len=%d type=%hu\n", + ng->nfgen_family == AF_INET ? "INET" : "other", + ng->version, + ntohs(ng->res_id), + nh->nlmsg_len, + nh->nlmsg_type); + + attr = NFM_NFA(ng); + alen = nh->nlmsg_len - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(*ng))); + while (NFA_OK(attr, alen)) { + switch (NFA_TYPE(attr)) { + case NFULA_PAYLOAD: + pkt = NFA_DATA(attr); + plen = NFA_PAYLOAD(attr); + break; + case NFULA_TIMESTAMP: + nf_ts = NFA_DATA(attr); + ts_sec = be64toh(nf_ts->sec); + ts_usec = be64toh(nf_ts->usec); + break; + case NFULA_GID: + case NFULA_PACKET_HDR: + case NFULA_PREFIX: + case NFULA_IFINDEX_OUTDEV: + case NFULA_UID: + default: + if (config_debug_lvl) + fprintf(stderr, " attr @%lu other type=%d\n", + ((unsigned long) attr) - (unsigned long) ng, + NFA_TYPE(attr)); + } + attr = NFA_NEXT(attr, alen); + } + + if (nh->nlmsg_type == NLMSG_DONE) + break; + + fn(pkt, plen, ts_sec, ts_usec); + } +} + +int nflog_read(int fd, log_fn fn) +{ + static char data[IOVLEN][PKTLEN]; + struct mmsghdr msgs[IOVLEN]; + struct iovec iovecs[IOVLEN]; + int i, len; + + memset(msgs, 0, sizeof(msgs)); + for (i = 0; i < IOVLEN; i++) { + iovecs[i].iov_base = data[i]; + iovecs[i].iov_len = PKTLEN; + msgs[i].msg_hdr.msg_iov = &iovecs[i]; + msgs[i].msg_hdr.msg_iovlen = 1; + } + + len = recvmmsg(fd, msgs, IOVLEN, MSG_DONTWAIT, NULL); + if (len == -1) { + if (errno == EAGAIN || errno == EINTR) + return 0; + if (errno == ENOBUFS) { + static int report_overflow; + if (!report_overflow) { + report_overflow = 1; + fprintf(stderr, "nflog: socket overflow detected. some packets will be lost (only warning once).\n"); + } + return 0; + } + error(1, errno, "recvmsg"); + } + + if (config_debug_lvl > 1) + fprintf(stderr, "recvmmsg len=%u\n", len); + + for (i = 0; i < len; i++) + nflog_parse(data[i], msgs[i].msg_len, fn); + + return 1; +} + +static int nflog_wait(int fd) +{ + struct pollfd pollfd[2]; + int len; + + do { + memset(&pollfd, 0, sizeof(pollfd)); + + pollfd[0].events = POLLIN; + pollfd[0].fd = 0; + + pollfd[1].events = POLLIN; + pollfd[1].fd = fd; + + len = poll(pollfd, 2, 50); + if (len == -1) { + if (errno == EINTR) + continue; + error(1, errno, "poll"); + } + if (len && pollfd[0].revents) + return 0; + } while (!len); + + return 1; +} + +void nflog_loop(int fd, log_fn fn) +{ + while (nflog_wait(fd)) { + while (nflog_read(fd, fn)) {} + } +} + +void nflog_all(log_fn fn, unsigned int snaplen) +{ + int fd; + + fd = nflog_init(snaplen); + nflog_loop(fd, fn); + nflog_exit(fd); +} + diff --git a/tools/tcplate/libnflog.h b/tools/tcplate/libnflog.h new file mode 100644 index 0000000..96af568 --- /dev/null +++ b/tools/tcplate/libnflog.h @@ -0,0 +1,27 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Netfilter LOG support library + * + * Only reads outgoing packets + */ + +#ifndef _LIBNFLOG_H_ +#define _LIBNFLOG_H_ + +#include + +/* can be called with len 0 */ +typedef void (*log_fn)(const void *pkt, unsigned int len, + uint64_t ts_sec, uint64_t ts_usec); + +void nflog_all(log_fn fn, unsigned int snaplen); + +int nflog_init(unsigned int snaplen); +int nflog_read(int fd, log_fn fn); +void nflog_loop(int fd, log_fn fn); +void nflog_exit(int fd); + +#endif // _LIBNFLOG_H_ + diff --git a/tools/tcplate/libpsock.c b/tools/tcplate/libpsock.c new file mode 100644 index 0000000..d810e3d --- /dev/null +++ b/tools/tcplate/libpsock.c @@ -0,0 +1,186 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Packet socket support library + * + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpsock.h" + +static void +psock_init_ring(struct psock *ps) +{ + struct tpacket_req tp; + int frames_per_block; + + if (ps->frame_size & (TPACKET_ALIGNMENT - 1)) + error(1, 0, "illegal frame size"); + + tp.tp_frame_size = ps->frame_size; + tp.tp_frame_nr = ps->frame_count; + + frames_per_block = getpagesize() / ps->frame_size; + tp.tp_block_size = getpagesize(); + tp.tp_block_nr = ps->frame_count / frames_per_block; + + if (setsockopt(ps->fd, SOL_PACKET, PACKET_RX_RING, (void*) &tp, sizeof(tp))) + error(1, errno, "setsockopt() ring"); + + ps->ring = mmap(0, tp.tp_block_size * tp.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, ps->fd, 0); + if (!ps->ring) + error(1, 0, "mmap()"); +} + +struct sock_filter egress_filter[] = { + { BPF_LD|BPF_B|BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PKTTYPE }, + { BPF_JMP|BPF_JEQ, 1, 0, PACKET_OUTGOING }, + { BPF_RET, 0, 0, 0x00000000 }, + { BPF_RET, 0, 0, 0x0000ffff } +}; + +struct sock_fprog egress_fprog = { + .len = sizeof(egress_filter) / sizeof(egress_filter[0]), + .filter = egress_filter, +}; + +void +psock_init(struct psock *ps) +{ + int val; + + ps->fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (ps->fd < 0) + error(1, errno, "socket()"); + + val = TPACKET_V2; + if (setsockopt(ps->fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt() version"); + val = 1; + if (setsockopt(ps->fd, SOL_PACKET, PACKET_TIMESTAMP, &val, sizeof(val))) + error(1, errno, "setsockopt() tstamp"); + if (setsockopt(ps->fd, SOL_SOCKET, SO_ATTACH_FILTER, + &egress_fprog, sizeof(egress_fprog))) + error(1, errno, "setsockopt() filter"); + + if (ps->dev) { + struct sockaddr_ll laddr; + + memset(&laddr, 0, sizeof(laddr)); + laddr.sll_family = AF_PACKET; + laddr.sll_protocol = htons(ETH_P_ALL); /* must be on ptype_all to sniff egress */ + laddr.sll_ifindex = if_nametoindex(ps->dev); + if (!laddr.sll_ifindex) + error(1, errno, "no such device: %s", ps->dev); + if (bind(ps->fd, (void *) &laddr, sizeof(laddr))) + error(1, errno, "bind device: %s (%d)", ps->dev, laddr.sll_ifindex); + } + + psock_init_ring(ps); +} + +static int +psock_wait(struct psock *ps) +{ + struct pollfd pollset[2]; + int ret; + + pollset[0].fd = 0; + pollset[0].events = POLLIN; + pollset[0].revents = 0; + + pollset[1].fd = ps->fd; + pollset[1].events = POLLIN; + pollset[1].revents = 0; + + ret = poll(pollset, 2, 100); + if (ret < 0 && errno != EINTR && errno != EAGAIN) + error(1, errno, "poll()"); + + if (ret > 0 && pollset[0].revents) + return 0; + + return 1; +} + +int +psock_read(struct psock *ps, psock_fn fn) +{ + struct tpacket2_hdr *header; + + header = (void *) ps->ring + (ps->idx_reader * ps->frame_size); + + if (!(header->tp_status & TP_STATUS_USER)) + return 0; + if (header->tp_status & TP_STATUS_COPY) + error(1, 0, "detected incomplete packed"); + if (header->tp_status & TP_STATUS_LOSING) { + static int report_overflow; + if (!report_overflow) { + report_overflow = 1; + fprintf(stderr, "psock: socket overflow detected. some packets will be lost (only warning once).\n"); + } + } + + fn(header, ((void *) header) + header->tp_mac); + + header->tp_status = TP_STATUS_KERNEL; + ps->idx_reader = (ps->idx_reader + 1) & (ps->frame_count - 1); + return 1; +} + +void +psock_loop(struct psock *ps, psock_fn fn) +{ + while (psock_wait(ps)) { + while (psock_read(ps, fn)) {} + } +} + +void +psock_exit(struct psock *ps) +{ + if (munmap(ps->ring, ps->frame_count * ps->frame_size)) + error(1, errno, "munmap"); + + if (close(ps->fd)) + error(1, errno, "close"); +} + +void +psock_all(int frame_count, int frame_size, const char *dev, psock_fn fn) +{ + struct psock ps; + + memset(&ps, 0, sizeof(ps)); + + ps.frame_count = frame_count; + ps.frame_size = frame_size; + if (dev) + ps.dev = dev; + + psock_init(&ps); + psock_loop(&ps, fn); + psock_exit(&ps); +} + diff --git a/tools/tcplate/libpsock.h b/tools/tcplate/libpsock.h new file mode 100644 index 0000000..e86dfeb --- /dev/null +++ b/tools/tcplate/libpsock.h @@ -0,0 +1,36 @@ +/* + * Copyright 2014 Google Inc. All Rights Reserved. + * Author: willemb@google.com (Willem de Bruijn) + * + * Packet socket support library + * + * Only reads outgoing packets + */ + +#ifndef _LIBPSOCK_H_ +#define _LIBPSOCK_H_ + +#include + +struct psock { + int frame_size; + int frame_count; + const char *dev; /* (optional) device to bind to */ + + /* internal */ + int fd; + char *ring; + int idx_reader; +}; + +typedef void (*psock_fn)(struct tpacket2_hdr *tp, void *pkt); + +void psock_all(int frame_count, int frame_size, const char *dev, psock_fn fn); + +void psock_init(struct psock *ps); +int psock_read(struct psock *ps, psock_fn fn); +void psock_loop(struct psock *ps, psock_fn fn); +void psock_exit(struct psock *ps); + +#endif // _LIBPSOCK_H_ + diff --git a/tools/tcplate/tcplate.c b/tools/tcplate/tcplate.c new file mode 100644 index 0000000..77927f1 --- /dev/null +++ b/tools/tcplate/tcplate.c @@ -0,0 +1,604 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Measure tcp latency through the kernel using pcap and nflog + * + * Read TCP/IP packets using pcap and nflog and calculate the + * latency spent within traffic shaping by subtracting timestamp + * of the first occurrence (iptables) from the timestamp of the + * second occurrence (packet socket). + * + * It has two modes: + * normal: latency of traffic shaping from protocol layer to dev: + * this subtracts a tstamp in packetsock on dev (eth0) + * from a tstamp in the ip layer at iptables NFLOG + * bonding: latency of traffic shaping on bonding slaves: + * this reads packets on every device, sees the same + * on both master (e.g., bond0) and slaves. + * + * Testing: + * verified correctness by adding delay at the relevant traffic + * shaping layer with + * `tc qdisc add dev $ETH root est 1sec 4sec netem limit 40000 delay 20ms` + * + * Implementation: + * tcplate uses two datastructures: + * - table: a hashtable to store new TCP segments and their timestamp + * - logs: a circular buffer to store tstamp diff on second viewing + * Logs is double buffered to allow sorting results offline. + * + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnflog.h" +#include "libpsock.h" + +static int log_len = 10000; +static int table_len = 57251; /* prime */ +static int ival = 1; +static int frame_count = (1 << 14); +static int frame_size = 128; +static int bond_mode; +static int debug_mode; +static int show_extended; +static int verbose; +static char dev[IFNAMSIZ + 1] = "eth0"; +static uint8_t tos_mask = UCHAR_MAX; +static bool tos_filter = false; + +/* race condition. TODO: protect */ +static uint64_t collisions; +static uint64_t pktcount; +static uint64_t count_nflog; +static uint64_t count_psock; + +/* double buffered list of observations */ +static int64_t *logs[2]; +static int log_selector; +static int log_head; +static int exit_hard; + +struct table_key_full { + __be32 ip_src; + __be32 ip_dst; + __be16 tcp_src; + __be16 tcp_dst; + __be32 seqno; +} __attribute__((packed)); + +union table_key { + struct table_key_full full; + __int128 cmp; +}; + +struct table_elem { + union table_key key; + int64_t tstamp; +}; + +/* not thread safe */ +struct table_elem *table; + +/* Show how many table elements are in use */ +static int +table_scan(void) +{ + int i, used = 0; + + for (i = 0; i < table_len; i++) + if (table[i].key.cmp) + used++; + + return used; +} + +static void +log_record(int64_t val) +{ + /* do not wrap log_head, to discern a partial from full log */ + logs[log_selector][log_head % log_len] = val; + log_head++; +} + +/* switch between double buffered logs, return number of recorded events */ +static int +log_rotate(void) +{ + int old_head; + + log_selector = (log_selector + 1) & 0x1; + old_head = log_head; + log_head = 0; + + return old_head; +} + +/* qsort comparison callback */ +static int +log_compar(const void *_a, const void *_b) +{ + const int64_t *a = _a, *b = _b; + return *a < *b ? -1 : (*a > *b ? 1 : 0); +} + +static void +log_show(void) +{ + int len, matches, selector; + + matches = log_rotate(); + len = matches < log_len ? matches : log_len; + selector = (log_selector + 1) & 0x1; + + qsort(logs[selector], len, sizeof(logs[0][0]), log_compar); + if (len >= 100) { + fprintf(stderr, " %8ld %8ld %8ld", + logs[selector][len / 2], + logs[selector][(len * 9) / 10], + logs[selector][(len * 99) / 100]); + + if (show_extended) + fprintf(stderr, " %10lu %10u %10lu %10d", + pktcount, matches, collisions, + table_scan()); + if (show_extended && verbose > 0) + fprintf(stderr, " %10lu %10lu", + count_nflog, count_psock); + write(2, "\n", 1); + } else { + write(2, ".\n", 2); + } + + collisions = 0; + pktcount = 0; + count_nflog = 0; + count_psock = 0; +} + +/* From "The Practice of Programming" via + * PERL_HASH in Perl 5.005, which is GPLv1 */ +static int hash_compute(void *_key, int klen) +{ + const unsigned int multiplier = 37; + unsigned char *cur, *key = _key; + unsigned int h = 0; + + for (cur = key; cur - key < klen; cur++) + h = (h * multiplier) + *cur; + return h + (h >> 5); +} + +static void +packet_process(__be32 ip_src, __be32 ip_dst, + __be16 tcp_src, __be16 tcp_dst, + __be32 seqno, int64_t tstamp, + int caller_type) +{ + union table_key key; + unsigned int idx; + + key.full.ip_src = ip_src; + key.full.ip_dst = ip_dst; + key.full.tcp_src = tcp_src; + key.full.tcp_dst = tcp_dst; + key.full.seqno = seqno; + + idx = hash_compute(&key, sizeof(key)); + idx %= table_len; + + /* if key is new, insert new tstamp */ + if (!table[idx].key.cmp) { +insert: + table[idx].key.cmp = key.cmp; + table[idx].tstamp = tstamp; + pktcount++; + } + /* if collision, record and insert */ + else if (table[idx].key.cmp != key.cmp) { + collisions++; + goto insert; + } + /* else log the diff and clear the key */ + else { + tstamp = tstamp - table[idx].tstamp; + if (tstamp < 0) + tstamp = -tstamp; + log_record(tstamp); + table[idx].key.cmp = 0; + } + + if (debug_mode) + fprintf(stderr, "%s %u:%hu > %u:%hu seqno=%u time=%lu\n", + caller_type == 0 ? "nflog" : "psock", + ntohl(ip_src), ntohs(tcp_src), + ntohl(ip_dst), ntohs(tcp_dst), + ntohl(seqno), tstamp); +} + +static bool +tos_match(uint8_t tos) +{ + if (!tos_filter) + return true; + + if (tos & tos_mask) + return true; + + if (tos == tos_mask) + return true; + + return false; +} + +static void +packet_callback(struct tpacket2_hdr *tp, void *pkt) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + eth = pkt; + + if (eth->h_proto != htons(ETH_P_IP)) + return; + + iph = pkt + sizeof(*eth); + + /* TODO: support IPv6 */ + if (iph->version != 4) + error(1, 0, "bug in parsing ip header"); + + if (iph->protocol != IPPROTO_TCP) + return; + if (!tos_match(iph->tos)) + return; + + tcph = ((void *) iph) + (iph->ihl << 2); + packet_process(iph->saddr, iph->daddr, + tcph->source, tcph->dest, + tcph->seq, + (1000LL * 1000 * tp->tp_sec) + tp->tp_nsec / 1000, + 1); + + count_psock++; +} + +static void +nflog_callback(const void *data, unsigned int len, + uint64_t ts_sec, uint64_t ts_usec) +{ + const struct iphdr *iph = data; + const struct tcphdr *tcph; + + if (!len) + return; + + if (iph->version != 4) + error(1, 0, "bug in parsing ip header"); + if ((iph->ihl << 2) + sizeof(*tcph) > len) + error(1, 0, "nflog snaplen too small"); + + if (iph->protocol != IPPROTO_TCP) + return; + if (!tos_match(iph->tos)) + return; + + tcph = ((void *) iph) + (iph->ihl << 2); + packet_process(iph->saddr, iph->daddr, + tcph->source, tcph->dest, + tcph->seq, + (1000LL * 1000 * ts_sec) + ts_usec, + 0); + + count_nflog++; +} + +static void +sigalrm_handler(int signum) +{ + log_show(); + alarm(ival); +} + +static void +sigint_handler(int signum) +{ + + if (exit_hard) + exit(1); + + /* first try to exit gracefully based in EINTR in poll */ + exit_hard = 1; +} + +static void +__init(void) +{ + logs[0] = malloc(log_len * sizeof(logs[0][0])); + logs[1] = malloc(log_len * sizeof(logs[0][0])); + table = calloc(table_len, sizeof(struct table_elem)); + if (!logs[0] || !logs[1] || !table) + error(1, 0, "alloc"); +} + +static void +__exit(void) +{ + free(table); + free(logs[1]); + free(logs[0]); +} + +static void __attribute__((noreturn)) +usage(const char *filepath) +{ + fprintf(stderr, "usage: %s [-bdfFhqvx] [-c count] [-i iface] [-l loglen] [-L tbllen] [-t ival]\n" + "\n" + "where\n" + " -b sets bonded mode: latency in slave device tc\n" + " -c sets capture queue length (in packets)\n" + " -d debug mode, displays individual records\n" + " -f filter by TOS bits (pass as base 10 or 16)\n" + " -h to show this message and exits\n" + " -i interface (default: eth0)\n" + " -l sets the timestamp log length\n" + " -L sets the tcp segment hashtable length\n" + " -q quiet: suppresses more output\n" + " -t sets the display interval (secs)\n" + " -v sets the verbose option\n" + " -x show extended stats: #matched, collisions, ..\n", + filepath); + exit(1); +} + +static void +parse_opt(int argc, char **argv) +{ + int c; + + while ((c = getopt (argc, argv, "bc:df:hi:l:L:qt:vx")) != -1) + { + switch (c) { + case 'b': + bond_mode = 1; + break; + case 'c': + frame_count = strtoul(optarg, NULL, 10); + break; + case 'd': + debug_mode = 1; + break; + case 'f': + tos_mask = strtoul(optarg, NULL, 0); + tos_filter = true; + break; + case 'h': + usage(argv[0]); + break; + case 'i': + strncpy(dev, optarg, IFNAMSIZ); + break; + case 'l': + log_len = strtoul(optarg, NULL, 10); + break; + case 'L': + table_len = strtoul(optarg, NULL, 10); + break; + case 'q': + if (verbose > 0) + error(1, 0, "pass -q or -v"); + verbose = -1; + break; + case 't': + ival = strtoul(optarg, NULL, 10); + break; + case 'v': + if (verbose < 0) + error(1, 0, "pass -q or -v"); + verbose = 1; + break; + case 'x': + show_extended = 1; + break; + } + } + + if (verbose > 0) { + fprintf(stderr, "mode: %s\n", bond_mode ? "bond" : dev); + fprintf(stderr, "log_len: %u\n", log_len); + fprintf(stderr, "table_len: %u\n", table_len); + fprintf(stderr, "frame_count: %u\n", frame_count); + fprintf(stderr, "frame_size: %u\n", frame_size); + fprintf(stderr, "interval: %u\n", ival); + if (tos_filter) + fprintf(stderr, "tos mask: 0x%x\n", tos_mask); + } +} + +/* @return 1 if data ready, 0 to exit */ +static int do_wait(int fd1, int fd2) +{ + struct pollfd pollset[3]; + int ret; + + pollset[0].fd = 0; + pollset[0].events = POLLIN; + pollset[0].revents = 0; + + pollset[1].fd = fd1; + pollset[1].events = POLLIN; + pollset[1].revents = 0; + + pollset[2].fd = fd2; + pollset[2].events = POLLIN; + pollset[2].revents = 0; + + /* minor race with entering poll(), below */ + if (exit_hard) + return 0; + + ret = poll(pollset, fd2 >= 0 ? 3 : 2, 100); + if (ret < 0 && errno != EINTR) + error(1, errno, "poll()"); + + if (ret > 0 && pollset[0].revents) + return 0; + + return 1; +} + +#define IPT_RULE " -m time -j NFLOG --nflog-group=10 --nflog-threshold=1" +static void __exit_nflog(void) +{ + if (verbose > 0) + system("iptables -v -nL OUTPUT | grep NFLOG"); + if (system("iptables -D OUTPUT " IPT_RULE)) { + error(1, 0, "error while removing log module"); + } +} + +/* + * System configuration change: insert an iptables rule. + * Ensure rollback with atexit() (though this fails with SIGINT, ..) + */ +static void __init_nflog(void) +{ + int ret; + + ret = system("iptables -L OUTPUT | grep -q NFLOG"); + if (ret == -1) + error(1, 0, "read iptables"); + if (WEXITSTATUS(ret) == 0) + error(1, 0, "log module still loaded? try iptables -L"); + + if (system("iptables -A OUTPUT" IPT_RULE)) { + __exit_nflog(); + error(1, 0, "load log module"); + } + atexit(__exit_nflog); +} + +static void __main(void) +{ + struct psock ps; + int logfd; + + memset(&ps, 0, sizeof(ps)); + ps.frame_count = frame_count; + ps.frame_size = frame_size; + /* + * in normal mode, get timestamp at ip layer and eth0 dequeue + * in bond mode, get timestamp at bond0 and eth0 dequeue. + * + * filter psock on eth0 if calculating latency from ip to eth0. + * else, do not filter to read packet on both master and slave, + * but disable nflog. + */ + if (bond_mode) { + logfd = -1; + } else { + /* + * snaplen must be smaller than PKTLEN in nflog_read + * or packets that are > PKTLEN && <= snaplen are dropped + */ + const int snaplen = 60; + + logfd = nflog_init(snaplen); + ps.dev = dev; + } + + psock_init(&ps); + + while (do_wait(ps.fd, logfd)) { + if (logfd != -1) + while (nflog_read(logfd, nflog_callback)) {} + while (psock_read(&ps, packet_callback)) {} + } + + psock_exit(&ps); + if (logfd != -1) + nflog_exit(logfd); +} + +static void +print_header(void) +{ +#define MAIN_HEADER "latency: 50 90 99 (%% us)" +#define EXTRA_HEADER " #total #matches #collis. #tblkeys" + + fprintf(stderr, "\npress Enter to exit\n" + "\n. indicates insufficient data\n" + "\n"); + + if (show_extended) + fprintf(stderr, MAIN_HEADER EXTRA_HEADER "\n"); + else + fprintf(stderr, MAIN_HEADER "\n"); +} + +int +main(int argc, char **argv) +{ + if (verbose >= 0) + fprintf(stderr, "tcplate v1.2: measure traffic shaping TCP latency\n"); + + parse_opt(argc, argv); + + if (verbose >= 0) + print_header(); + + __init(); + + signal(SIGALRM, &sigalrm_handler); + signal(SIGINT, &sigint_handler); + + alarm(ival); + + __init_nflog(); + __main(); + __exit(); + + return 0; +} + From a55042e3c241a9031809f9c0224a2b9816cf6cc8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 8 Aug 2016 15:49:12 -0400 Subject: [PATCH 11/20] tools: clarify tcplate GPL license --- tools/tcplate/libnflog.c | 21 ++++++++++++++++++--- tools/tcplate/libnflog.h | 17 ++++++++++++++++- tools/tcplate/libpsock.c | 16 +++++++++++++++- tools/tcplate/libpsock.h | 17 ++++++++++++++++- tools/tcplate/tcplate.c | 2 +- 5 files changed, 66 insertions(+), 7 deletions(-) diff --git a/tools/tcplate/libnflog.c b/tools/tcplate/libnflog.c index 08f808a..9fd1006 100644 --- a/tools/tcplate/libnflog.c +++ b/tools/tcplate/libnflog.c @@ -1,5 +1,5 @@ /* - * Copyright 2014 Google Inc. All Rights Reserved. + * Copyright 2014 Google Inc. * Author: willemb@google.com (Willem de Bruijn) * * Netlink support library @@ -15,7 +15,22 @@ * or even * `iptables -A OUTPUT -m time -j NFLOG --nflog-group=10` * - * TODO(willemb): optimize by using mmapped ring. + * TODO: optimize by using mmapped ring, similar to psock + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #define _GNU_SOURCE @@ -82,7 +97,7 @@ static void __nflog_sendcmd(int fd, uint8_t cmd, void *msg, int msglen, if (send(fd, buf, nh->nlmsg_len, 0) != nh->nlmsg_len) error(1, errno, "sendcmd"); - /* TODO(willemb): handle EINTR */ + /* TODO: handle EINTR */ ret = recv(fd, buf, sizeof(buf), 0); if (ret == -1) error(1, errno, "recv ctrl: sock error"); diff --git a/tools/tcplate/libnflog.h b/tools/tcplate/libnflog.h index 96af568..d1cdbb7 100644 --- a/tools/tcplate/libnflog.h +++ b/tools/tcplate/libnflog.h @@ -1,10 +1,25 @@ /* - * Copyright 2014 Google Inc. All Rights Reserved. + * Copyright 2014 Google Inc. * Author: willemb@google.com (Willem de Bruijn) * * Netfilter LOG support library * * Only reads outgoing packets + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #ifndef _LIBNFLOG_H_ diff --git a/tools/tcplate/libpsock.c b/tools/tcplate/libpsock.c index d810e3d..c92cb78 100644 --- a/tools/tcplate/libpsock.c +++ b/tools/tcplate/libpsock.c @@ -1,9 +1,23 @@ /* - * Copyright 2014 Google Inc. All Rights Reserved. + * Copyright 2014 Google Inc. * Author: willemb@google.com (Willem de Bruijn) * * Packet socket support library * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #define _GNU_SOURCE diff --git a/tools/tcplate/libpsock.h b/tools/tcplate/libpsock.h index e86dfeb..922f607 100644 --- a/tools/tcplate/libpsock.h +++ b/tools/tcplate/libpsock.h @@ -1,10 +1,25 @@ /* - * Copyright 2014 Google Inc. All Rights Reserved. + * Copyright 2014 Google Inc. * Author: willemb@google.com (Willem de Bruijn) * * Packet socket support library * * Only reads outgoing packets + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #ifndef _LIBPSOCK_H_ diff --git a/tools/tcplate/tcplate.c b/tools/tcplate/tcplate.c index 77927f1..b6064f7 100644 --- a/tools/tcplate/tcplate.c +++ b/tools/tcplate/tcplate.c @@ -198,7 +198,7 @@ log_show(void) } /* From "The Practice of Programming" via - * PERL_HASH in Perl 5.005, which is GPLv1 */ + * PERL_HASH in Perl 5.005, which is GPL */ static int hash_compute(void *_key, int klen) { const unsigned int multiplier = 37; From 87f019e0cb3a1fc85bede6bc2d70886d0e038b58 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 21 Oct 2016 17:27:57 -0400 Subject: [PATCH 12/20] tests: extend psock_txring_vnet with vnet_hdr without GSO Allow passing a vnet_hdr to the packet socket without triggering segmentation offload. This allows testing checksum offload on MTU sized packets. --- tests/psock_txring_vnet.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c index c9bdf28..74e84e7 100644 --- a/tests/psock_txring_vnet.c +++ b/tests/psock_txring_vnet.c @@ -30,12 +30,21 @@ #include #include +#if 0 +/* requires libcap-dev */ +#include +#else +extern int capset(cap_user_header_t header, cap_user_data_t data); +extern int capget(cap_user_header_t header, const cap_user_data_t data); +#endif + #ifndef PACKET_QDISC_BYPASS #define PACKET_QDISC_BYPASS 20 #endif static bool cfg_enable_ring = true; static bool cfg_enable_vnet = false; +static bool cfg_enable_gso = true; static char *cfg_ifname = "eth0"; static int cfg_ifindex; static int cfg_num_frames = 4; @@ -160,15 +169,17 @@ static int frame_fill(void *buffer, unsigned int payload_len) vnet = buffer; - vnet->hdr_len = ETH_HLEN + sizeof(*iph) + sizeof(*tcph); - vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; vnet->csum_start = ETH_HLEN + sizeof(*iph); vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); - vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - - sizeof(struct tcphdr); + if (cfg_enable_gso) { + vnet->hdr_len = ETH_HLEN + sizeof(*iph) + sizeof(*tcph); + vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - + sizeof(struct tcphdr); + } + off += sizeof(*vnet); } @@ -319,7 +330,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "cd:i:l:L:n:Nqs:v")) != -1) + while ((c = getopt(argc, argv, "cd:Gi:l:L:n:Nqs:v")) != -1) { switch (c) { case 'c': @@ -329,6 +340,10 @@ static void parse_opts(int argc, char **argv) if (!inet_aton(optarg, &ip_daddr)) error(1, 0, "bad ipv4 destination address"); break; + case 'G': + fprintf(stderr, "gso disabled (-v only sets csum)\n"); + cfg_enable_gso = false; + break; case 'i': cfg_ifname = optarg; break; From 4eb0ac6bcdf03a4e268362108ef1039983c0cc03 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 25 Oct 2016 15:37:22 -0400 Subject: [PATCH 13/20] tests: fix memset in psock_txring_vnet --- tests/psock_txring_vnet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c index 74e84e7..8641187 100644 --- a/tests/psock_txring_vnet.c +++ b/tests/psock_txring_vnet.c @@ -221,13 +221,13 @@ static void ring_write(void *slot) if (header->tp_status != TP_STATUS_AVAILABLE) error(1, 0, "write: slot not available"); - memset(slot + TPACKET2_HDRLEN, 0, req.tp_frame_size - TPACKET2_HDRLEN); + header->tp_mac = TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); + memset(slot + header->tp_mac, 0, req.tp_frame_size - header->tp_mac); len = frame_fill(slot + header->tp_mac, cfg_payload_len); if (cfg_override_len < len) len = cfg_override_len; - header->tp_mac = TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); header->tp_len = len; header->tp_status = TP_STATUS_SEND_REQUEST; } From ace9a03f80fbe4b27ea27a3e6818f5968abbbdf7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 25 Oct 2016 15:39:02 -0400 Subject: [PATCH 14/20] tests: add test for recvmsg cmsg IP_CHECKSUM --- tests/recv_cmsg_ipchecksum.c | 156 +++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 tests/recv_cmsg_ipchecksum.c diff --git a/tests/recv_cmsg_ipchecksum.c b/tests/recv_cmsg_ipchecksum.c new file mode 100644 index 0000000..5c0508a --- /dev/null +++ b/tests/recv_cmsg_ipchecksum.c @@ -0,0 +1,156 @@ +/* + * Test recv cmsg IP_CHECKSUM + * + * For both IPv4 and v4-mapped-v6: + * - read 100 B packet + * - peek 100 B packet at 2 B offset + * + * The cmsg is expected to arrive with CHECKSUM_COMPLETE, including + * on receive checksum conversion. It does not work with IPv6 or with + * hardware checksum disabled. + * + * To run: start on one host and send traffic from another host: + * + * dd if=/dev/zero bs=1 count=100 | sed 's/./\x01/2' > payload + * for i in 1 2; do nc -p 9000 -q 1 -u $hostA 8000 < payload; done + * + * The sum of the payload of range [2, 99] is 0. + * The sum of the payload of range [0, 99] is 1. + * + * Expected sum16 output is + * peek: 0xFFFF (because all zeroes) + * read: 0x0100 (because one \x01 halfword) + * + * + * Author: Willem de Bruijn (willemb@google.com) + * GPL v2 applies + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IP_CHECKSUM 23 + +#define CFG_PORT 8000 +#define PAYLOAD_CHAR 0 +#define PAYLOAD_LEN 100 +#define PEEK_OFF 2 + +static inline uint16_t csum_fold(__wsum csum) +{ + uint32_t sum = csum; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (uint16_t) sum; +} + +static void do_rx(int fd, bool peek) +{ + char rbuf[100]; + struct cmsghdr *cm; + struct msghdr msg = {0}; + struct iovec iov = {0}; + char control[CMSG_SPACE(sizeof(__wsum))]; + int ret, expected; + + iov.iov_base = rbuf; + iov.iov_len = sizeof(rbuf); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, peek ? MSG_PEEK : 0); + if (ret == -1) + error(1, errno, "recv"); + if (msg.msg_flags & MSG_TRUNC) + error(1, errno, "recv: truncated data"); + if (msg.msg_flags & MSG_CTRUNC) + error(1, errno, "recv: truncated control"); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + __wsum check32; + + if (cm->cmsg_level != SOL_IP) + error(1, 0, "cmsg: level=%u", cm->cmsg_level); + if (cm->cmsg_type != IP_CHECKSUM) + error(1, 0, "cmsg: type=%u", cm->cmsg_level); + + check32 = *((__wsum*) CMSG_DATA(cm)); + fprintf(stderr, "csum: sum32=0x%0x sum16=%hx ~sum16=%hx\n", + check32, csum_fold(check32), ~csum_fold(check32)); + } + + expected = PAYLOAD_LEN - (peek ? PEEK_OFF : 0); + if (ret != expected) + error(1, 0, "recv: %uB != %uB", ret, expected); + if (rbuf[0] != PAYLOAD_CHAR) + error(1, 0, "recv: payload mismatch"); +} + +static void do_main(struct sockaddr *addr, socklen_t alen) +{ + int fd, ret, one = 1, two = 2; + + fd = socket(addr->sa_family, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket rx"); + + ret = bind(fd, addr, alen); + if (ret) + error(1, errno, "bind rx"); + + if (setsockopt(fd, SOL_IP, IP_CHECKSUM, &one, sizeof(one))) + error(1, errno, "setsockopt csum"); + + if (setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &two, sizeof(two))) + error(1, errno, "setsockopt peek_off"); + + do_rx(fd, true); + do_rx(fd, false); + + if (close(fd)) + error(1, errno, "close"); +} + +int main(int argc, char **argv) +{ + struct sockaddr_in addr4 = {0}; + struct sockaddr_in6 addr6 = {0}; + + fprintf(stderr, "PF_INET\n"); + addr4.sin_family = PF_INET; + addr4.sin_port = htons(CFG_PORT); + addr4.sin_addr.s_addr = htonl(INADDR_ANY); + do_main((void *) &addr4, sizeof(addr4)); + + fprintf(stderr, "PF_INET6\n"); + addr6.sin6_family = PF_INET6; + addr6.sin6_port = htons(CFG_PORT); + addr6.sin6_addr = in6addr_any; + do_main((void *) &addr6, sizeof(addr6)); + + return 0; +} From 451e9fbf0d049e32387c8f6a0ede1be3d0752654 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 25 Oct 2016 20:12:21 -0400 Subject: [PATCH 15/20] tests: fix pseudo header calculation in packet_txring_vnet --- tests/psock_txring_vnet.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c index 8641187..7999521 100644 --- a/tests/psock_txring_vnet.c +++ b/tests/psock_txring_vnet.c @@ -129,15 +129,21 @@ static unsigned long add_csum_hword(const uint16_t *start, int num_u16) return sum; } -static uint16_t build_ip_csum(const uint16_t *start, int num_u16, - unsigned long sum) +static uint16_t add_csum_hword_fold(const uint16_t *start, int num_u16, + unsigned long sum) { sum += add_csum_hword(start, num_u16); while (sum >> 16) sum = (sum & 0xffff) + (sum >> 16); - return ~sum; + return sum; +} + +static uint16_t build_ip_csum(const uint16_t *start, int num_u16, + unsigned long sum) +{ + return ~add_csum_hword_fold(start, num_u16, sum); } static uint16_t get_tcp_v4_csum(const struct iphdr *iph, @@ -145,11 +151,12 @@ static uint16_t get_tcp_v4_csum(const struct iphdr *iph, int length) { unsigned long pseudo_sum = 0; + uint16_t proto = htons(IPPROTO_TCP); + uint16_t ulen = htons(length); - pseudo_sum += add_csum_hword((void *) &iph->saddr, 2); - pseudo_sum += add_csum_hword((void *) &iph->daddr, 2); - pseudo_sum += htons(IPPROTO_TCP); - pseudo_sum += htons(length); + pseudo_sum = add_csum_hword_fold((void *) &iph->saddr, 4, 0); + pseudo_sum = add_csum_hword_fold(&proto, 1, pseudo_sum); + pseudo_sum = add_csum_hword_fold(&ulen, 1, pseudo_sum); if (cfg_enable_vnet) return pseudo_sum; From eddecb0ac8357c3afa460f83e255c0021523ec46 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 26 Oct 2016 12:52:41 -0400 Subject: [PATCH 16/20] tests: optionally skip checksum offload in psock_txring_vnet --- tests/psock_txring_vnet.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c index 7999521..10431cf 100644 --- a/tests/psock_txring_vnet.c +++ b/tests/psock_txring_vnet.c @@ -44,7 +44,8 @@ extern int capget(cap_user_header_t header, const cap_user_data_t data); static bool cfg_enable_ring = true; static bool cfg_enable_vnet = false; -static bool cfg_enable_gso = true; +static bool cfg_enable_csum = true; /* only used if cfg_enable_vnet */ +static bool cfg_enable_gso = true; /* only used if cfg_enable_vnet */ static char *cfg_ifname = "eth0"; static int cfg_ifindex; static int cfg_num_frames = 4; @@ -176,15 +177,19 @@ static int frame_fill(void *buffer, unsigned int payload_len) vnet = buffer; - vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet->csum_start = ETH_HLEN + sizeof(*iph); - vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); + if (cfg_enable_csum) { + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = ETH_HLEN + sizeof(*iph); + vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); + } if (cfg_enable_gso) { vnet->hdr_len = ETH_HLEN + sizeof(*iph) + sizeof(*tcph); vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - sizeof(struct tcphdr); + } else { + vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE; } off += sizeof(*vnet); @@ -337,18 +342,20 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "cd:Gi:l:L:n:Nqs:v")) != -1) + while ((c = getopt(argc, argv, "cCd:Gi:l:L:n:Nqs:v")) != -1) { switch (c) { case 'c': drop_capability(CAP_SYS_RAWIO); break; + case 'C': + cfg_enable_csum = false; + break; case 'd': if (!inet_aton(optarg, &ip_daddr)) error(1, 0, "bad ipv4 destination address"); break; case 'G': - fprintf(stderr, "gso disabled (-v only sets csum)\n"); cfg_enable_gso = false; break; case 'i': From a75f5aa4cf747fa8014ec22e9d5b89bd9bcf8780 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 2 Nov 2016 11:11:39 -0400 Subject: [PATCH 17/20] tests: add CMSG_RECVFRAGSIZE test corresponds to netdev submission ip: add RECVFRAGSIZE cmsg http://patchwork.ozlabs.org/patch/690431/ http://patchwork.ozlabs.org/patch/690430/ http://patchwork.ozlabs.org/patch/690429/ --- tests/recvfragsize.c | 200 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 tests/recvfragsize.c diff --git a/tests/recvfragsize.c b/tests/recvfragsize.c new file mode 100644 index 0000000..54c79ea --- /dev/null +++ b/tests/recvfragsize.c @@ -0,0 +1,200 @@ +/* Test IP(V6)_RECVFRAGSIZE socket option */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IP_RECVFRAGSIZE 25 +#define IPV6_RECVFRAGSIZE 77 + +static int cfg_dest_port = 6000; +static int cfg_expected_fragsize = 1300; +static int cfg_proto_l3 = PF_INET6; +static int cfg_proto_l4 = SOCK_DGRAM; + +static int socket_rx(int domain, int type, int protocol, + struct sockaddr *addr, socklen_t alen) +{ + int fd, level, optname, one = 1; + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket"); + + if (domain == PF_INET6) { + level = SOL_IPV6; + optname = IPV6_RECVFRAGSIZE; + } else { + level = SOL_IP; + optname = IP_RECVFRAGSIZE; + } + + if (setsockopt(fd, level, optname, &one, sizeof(one))) + error(1, errno, "setsockopt recvfragsize (%u.%u)", + level, optname); + + if (bind(fd, addr, alen)) + error(1, errno, "bind"); + + return fd; +} + +static int socket_rx_ipv6(int type, int protocol) +{ + struct sockaddr_in6 addr = { + .sin6_family = AF_INET6, + .sin6_port = htons(cfg_dest_port), + .sin6_addr = in6addr_any, + }; + + return socket_rx(PF_INET6, type, protocol, (void*) &addr, sizeof(addr)); +} + +static int socket_rx_ipv4(int type, int protocol) +{ + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons(cfg_dest_port), + .sin_addr.s_addr = htons(INADDR_ANY), + }; + + return socket_rx(PF_INET, type, protocol, (void*) &addr, sizeof(addr)); +} + +static void poll_one(int fd) +{ + struct pollfd pfd = {0}; + int ret; + + pfd.fd = fd; + pfd.events = POLLIN; + + ret = poll(&pfd, 1, 1000); + if (ret == -1) + error(1, errno, "poll"); + if (ret == 0) + error(1, 0, "poll: timeout"); + if (!(pfd.revents & POLLIN)) + error(1, 0, "poll: unexpected event(s) 0x%x\n", pfd.revents); +} + +static void rx_one(int fd, int level, int optname) +{ + struct msghdr msg = {0}; + struct cmsghdr *cmsg; + char control[2 * CMSG_SPACE(int)]; + int ret, size, num_cmsg = 0; + + poll_one(fd); + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, MSG_TRUNC); + if (ret == -1) + error(1, errno, "recvmsg"); + if (msg.msg_flags & MSG_CTRUNC) + error(1, 0, "recvmsg: truncated cmsg"); + fprintf(stderr, "recv: %uB\n", ret); + + for (cmsg = CMSG_FIRSTHDR(&msg); + cmsg && cmsg->cmsg_len; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != level || cmsg->cmsg_type != optname) + error(1, 0, "wrong cmsg 0x%x.0x%x", + cmsg->cmsg_level, cmsg->cmsg_type); + num_cmsg++; + fprintf(stderr, "cmsg_level=%u cmsg_type=%u\n", + cmsg->cmsg_level, cmsg->cmsg_type); + size = *((int*) CMSG_DATA(cmsg)); + fprintf(stderr, "max fragsize: %u\n", size); + } + + if (num_cmsg > 1) + error(1, 0, "unexpected #cmsg: %u\n", num_cmsg); + if (num_cmsg == 1 && size != cfg_expected_fragsize) + error(1, 0, "unexpected frag size: %u\n", size); +} + +static void run_one_ipv6(int type) +{ + int fd; + + fprintf(stderr, "ipv6 %s\n", type == SOCK_DGRAM ? "udp" : "raw"); + + /* IPv6 fragments are 8-byte aligned, expect for the last */ + cfg_expected_fragsize = cfg_expected_fragsize >> 3 << 3; + + fd = socket_rx_ipv6(type, type == SOCK_RAW ? IPPROTO_EGP : 0); + rx_one(fd, SOL_IPV6, IPV6_RECVFRAGSIZE); + if (close(fd)) + error(1, errno, "close"); +} + +static void run_one_ipv4(int type) +{ + int fd; + + fprintf(stderr, "ipv4 %s\n", type == SOCK_DGRAM ? "udp" : "raw"); + + fd = socket_rx_ipv4(type, type == SOCK_RAW ? IPPROTO_EGP : 0); + rx_one(fd, SOL_IP, IP_RECVFRAGSIZE); + if (close(fd)) + error(1, errno, "close"); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "46p:ru")) != -1) { + switch (c) { + case '4': + cfg_proto_l3 = PF_INET; + break; + case '6': + cfg_proto_l3 = PF_INET6; + break; + case 'p': + cfg_dest_port = strtoul(optarg, NULL, 0); + break; + case 'r': + cfg_proto_l4 = SOCK_RAW; + break; + case 'u': + cfg_proto_l4 = SOCK_DGRAM; + break; + default: + error(1, 0, "invalid option %c\n", c); + } + } +} + +int main(int argc, char **argv) +{ + parse_opts(argc, argv); + + if (cfg_proto_l3 == PF_INET) + run_one_ipv4(cfg_proto_l4); + else + run_one_ipv6(cfg_proto_l4); + + fprintf(stderr, "OK. All tests passed\n"); + return 0; +} + From 3855018468a764afdb58cfd9d3c955966a7dbab8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 1 May 2017 11:36:00 -0400 Subject: [PATCH 18/20] Add basic echo request/response test pingpong_tcpudp sends a simple echo request/response pair over packet sockets, mimicing TCP or UDP headers. --- tests/pingpong_tcpudp.c | 291 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 tests/pingpong_tcpudp.c diff --git a/tests/pingpong_tcpudp.c b/tests/pingpong_tcpudp.c new file mode 100644 index 0000000..dcc9c44 --- /dev/null +++ b/tests/pingpong_tcpudp.c @@ -0,0 +1,291 @@ +/* GNU GPLv2 applies */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool cfg_is_ack; +static bool cfg_is_client; +static int cfg_ifindex; +static struct in_addr cfg_ip_dst; +static struct in_addr cfg_ip_src; +static char cfg_mac_dst[ETH_HLEN]; +static char cfg_mac_src[ETH_HLEN]; +const int cfg_num_runs = 10; +const int cfg_payload_len = 10; /* must be even */ +static int cfg_pkt_len; +static int cfg_proto = IPPROTO_TCP; +const int cfg_tcp_dst = 0x2222; +const int cfg_tcp_src = 0x1111; +const int cfg_timeout_us = 1000 * 1000; + +static char packet[ETH_DATA_LEN]; + +static unsigned long gettimeofday_us(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (1000UL * 1000 * tv.tv_sec) + tv.tv_usec; +} + +static uint16_t calc_csum(const uint32_t pseudo, const uint16_t *data, int num_words) +{ + unsigned long sum = pseudo; + int i; + + for (i = 0; i < num_words; i++) + sum += data[i]; + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + return ~sum; +} + +static void build_pkt(void) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + struct udphdr *udph; + int off = 0, tslen; + + eth = (void *) packet; + memcpy(ð->h_dest, cfg_mac_dst, ETH_ALEN); + memcpy(ð->h_source, cfg_mac_src, ETH_ALEN); + eth->h_proto = htons(ETH_P_IP); + off += sizeof(*eth); + + if (cfg_proto == IPPROTO_UDP) + tslen = sizeof(*udph); + else + tslen = sizeof(*tcph); + cfg_pkt_len = sizeof(*eth) + sizeof(*iph) + tslen + cfg_payload_len; + + iph = (void *) packet + off; + iph->version = 4; + iph->ihl = 5; + iph->ttl = 2; + iph->id = 666; + iph->frag_off = htons(IP_DF); + iph->tot_len = htons((uint16_t) (sizeof(*iph) + tslen + cfg_payload_len)); + iph->saddr = cfg_ip_src.s_addr; + iph->daddr = cfg_ip_dst.s_addr; + iph->protocol = cfg_proto; + iph->check = calc_csum(0, (void *) iph, sizeof(*iph) >> 1); + off += sizeof(*iph); + + if (cfg_proto == IPPROTO_UDP) { + udph = (void *) packet + off; + udph->dest = htons(cfg_tcp_dst); + udph->source = htons(cfg_tcp_src); + udph->len = htons(tslen + cfg_payload_len); + udph->check = 0; + off += sizeof(*udph); + } else { + tcph = (void *) packet + off; + tcph->dest = htons(cfg_tcp_dst); + tcph->source = htons(cfg_tcp_src); + tcph->seq = htonl(1); + tcph->ack_seq = htonl(1); + tcph->doff = 5; + if (cfg_is_ack) + tcph->ack = 1; + tcph->psh = 1; + tcph->window = htons(16000); + tcph->check = htons(0xd1e2); + off += sizeof(*tcph); + } + + memset(packet + off, 'a', cfg_payload_len); +} + +static void do_recv(int fd) +{ + char rdata[ETH_DATA_LEN] = {0}; + int ret; + + ret = read(fd, rdata, sizeof(rdata)); + if (ret == -1) + error(1, errno, "read"); + /* TODO: understand why returned packet exceeds cfg_pkt_len */ + if (ret < cfg_pkt_len) + error(1, 0, "read: %uB != %uB\n", ret, cfg_pkt_len); +} + +static void do_send(int fd) +{ + int ret; + + ret = send(fd, packet, cfg_pkt_len, MSG_DONTWAIT); + if (ret == -1) + error(1, errno, "write"); + if (ret != cfg_pkt_len) + error(1, 0, "write: %uB != %uB\n", ret, cfg_pkt_len); +} + +static void do_server(int fd) +{ + unsigned long t1, t2 = 0; + int i; + + for (i = 0; i < cfg_num_runs; i++) { + do_recv(fd); + t1 = t2; + t2 = gettimeofday_us(); + do_send(fd); + + if (t1) + fprintf(stderr, "%d. RTT: %lu usec\n", i, t2 - t1); + } + do_send(fd); +} + +static void do_client(int fd) +{ + unsigned long t1, t2; + int i; + + for (i = 0; i < cfg_num_runs; i++) { + t1 = gettimeofday_us(); + do_send(fd); + do_recv(fd); + t2 = gettimeofday_us(); + + fprintf(stderr, "%d. RTT: %lu usec\n", i, t2 - t1); + } +} + +static void set_filter(int fd) +{ + const int ip_prot_off = sizeof(struct ethhdr) + + __builtin_offsetof(struct iphdr, protocol); + const int tcp_dst_off = sizeof(struct ethhdr) + sizeof(struct iphdr) + + __builtin_offsetof(struct tcphdr, dest); + + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ip_prot_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_proto, 0, 3), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, tcp_dst_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_tcp_dst, 0, 1), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct sock_fprog prog; + + prog.filter = filter; + prog.len = sizeof(filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter mark"); +} + +static void read_mac(const char *mac_str, char *mac_bin) +{ + if (sscanf(mac_str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac_bin[0], &mac_bin[1], &mac_bin[2], + &mac_bin[3], &mac_bin[4], &mac_bin[5]) != 6) + error(1, 0, "bad mac: %s\n", optarg); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "acd:D:i:s:S:u")) != -1) { + switch (c) { + case 'a': + cfg_is_ack = true; + break; + case 'c': + cfg_is_client = true; + break; + case 'd': + if (inet_pton(PF_INET, optarg, &cfg_ip_dst) != 1) + error(1, 0, "bad src ip: %s\n", optarg); + break; + case 'D': + read_mac(optarg, cfg_mac_dst); + break; + case 'i': + cfg_ifindex = if_nametoindex(optarg); + if (!cfg_ifindex) + error(1, errno, "if_nametoindex"); + break; + case 's': + if (inet_pton(PF_INET, optarg, &cfg_ip_src) != 1) + error(1, 0, "bad src ip: %s\n", optarg); + break; + case 'S': + read_mac(optarg, cfg_mac_src); + break; + case 'u': + cfg_proto = IPPROTO_UDP; + break; + default: + error(1, 0, "unknown option %c", c); + } + } + + if (!cfg_ifindex) { + cfg_ifindex = if_nametoindex("eth0"); + if (!cfg_ifindex) + error(1, errno, "if_nametoindex"); + } +} + +int main(int argc, char **argv) +{ + struct sockaddr_ll addr = {0}; + int fd; + + parse_opts(argc, argv); + fprintf(stderr, "mode: %s\n", cfg_is_client ? "client" : "server"); + + fd = socket(PF_PACKET, SOCK_RAW, 0); + if (fd == -1) + error(1, errno, "socket"); + + set_filter(fd); + + addr.sll_family = AF_PACKET; + addr.sll_protocol = htons(ETH_P_IP); + addr.sll_ifindex = cfg_ifindex; + if (bind(fd, (void*) &addr, sizeof(addr))) + error(1, errno, "bind"); + + build_pkt(); + + if (cfg_is_client) + do_client(fd); + else + do_server(fd); + + if (close(fd)) + error(1, errno, "close"); + + return 0; +} + From 7ad8a936342e8b691053255d9dccbbb3e53253f4 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 1 May 2017 12:21:00 -0400 Subject: [PATCH 19/20] fixup echo request/response test --- tests/pingpong_tcpudp.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/pingpong_tcpudp.c b/tests/pingpong_tcpudp.c index dcc9c44..6a385d1 100644 --- a/tests/pingpong_tcpudp.c +++ b/tests/pingpong_tcpudp.c @@ -51,9 +51,9 @@ static unsigned long gettimeofday_us(void) return (1000UL * 1000 * tv.tv_sec) + tv.tv_usec; } -static uint16_t calc_csum(const uint32_t pseudo, const uint16_t *data, int num_words) +static uint16_t calc_csum(unsigned long sum, const uint16_t *data, + int num_words) { - unsigned long sum = pseudo; int i; for (i = 0; i < num_words; i++) @@ -65,6 +65,22 @@ static uint16_t calc_csum(const uint32_t pseudo, const uint16_t *data, int num_w return ~sum; } +static uint16_t calc_tcp_csum(struct iphdr *iph, struct tcphdr *tcph) +{ + unsigned long sum = 0, tcplen; + + tcplen = ntohs(iph->tot_len) - sizeof(*iph); + if (tcplen & 1) + error(1, 0, "odd length: csum needs padding"); + + sum += iph->daddr; + sum += iph->saddr; + sum += htons(iph->protocol); + sum += htons(tcplen); + + return calc_csum(sum, (void *) tcph, tcplen >> 1); +} + static void build_pkt(void) { struct ethhdr *eth; @@ -116,11 +132,9 @@ static void build_pkt(void) tcph->ack = 1; tcph->psh = 1; tcph->window = htons(16000); - tcph->check = htons(0xd1e2); + tcph->check = calc_tcp_csum(iph, tcph); off += sizeof(*tcph); } - - memset(packet + off, 'a', cfg_payload_len); } static void do_recv(int fd) From 43297b11a0f3e64743ba2436549cf9f48af6a6ce Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Sun, 15 Oct 2017 07:33:29 +0100 Subject: [PATCH 20/20] Add sendmmsg vector IO support 1. Add an option to send using sendmmsg 2. Reorganize packet creation to match sendmmsg semantics 3. Allow for the creation of packets split in 3: vmesg header, header, body - this matches the format that would be generated in a kernel skb is converted to an iov. Signed-off-by: Anton Ivanov --- tests/psock_txring_vnet.c | 146 ++++++++++++++++++++++++++++++-------- 1 file changed, 116 insertions(+), 30 deletions(-) diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c index 10431cf..a951d60 100644 --- a/tests/psock_txring_vnet.c +++ b/tests/psock_txring_vnet.c @@ -46,6 +46,7 @@ static bool cfg_enable_ring = true; static bool cfg_enable_vnet = false; static bool cfg_enable_csum = true; /* only used if cfg_enable_vnet */ static bool cfg_enable_gso = true; /* only used if cfg_enable_vnet */ +static bool cfg_vector_send = false; static char *cfg_ifname = "eth0"; static int cfg_ifindex; static int cfg_num_frames = 4; @@ -68,9 +69,11 @@ static int socket_open(void) if (fd == -1) error(1, errno, "socket"); - val = TPACKET_V2; - if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) - error(1, errno, "setsockopt version"); + if (cfg_enable_ring) { + val = TPACKET_V2; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + } if (cfg_qdisc_bypass) { val = 1; @@ -165,35 +168,31 @@ static uint16_t get_tcp_v4_csum(const struct iphdr *iph, return build_ip_csum((void *) tcph, length >> 1, pseudo_sum); } -static int frame_fill(void *buffer, unsigned int payload_len) +static void set_vheader(void *buffer) +{ + struct virtio_net_hdr *vnet; + vnet = buffer; + if (cfg_enable_csum) { + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = ETH_HLEN + sizeof(struct iphdr); + vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); + } + + if (cfg_enable_gso) { + vnet->hdr_len = ETH_HLEN + sizeof(struct iphdr) + sizeof(struct tcphdr); + vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - + sizeof(struct tcphdr); + } else { + vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE; + } +} + +static int set_packet(void *buffer, unsigned int off, unsigned int payload_len) { struct ethhdr *eth; struct iphdr *iph; struct tcphdr *tcph; - int off = 0; - - if (cfg_enable_vnet) { - struct virtio_net_hdr *vnet; - - vnet = buffer; - - if (cfg_enable_csum) { - vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet->csum_start = ETH_HLEN + sizeof(*iph); - vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); - } - - if (cfg_enable_gso) { - vnet->hdr_len = ETH_HLEN + sizeof(*iph) + sizeof(*tcph); - vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - - sizeof(struct tcphdr); - } else { - vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE; - } - - off += sizeof(*vnet); - } eth = buffer + off; memcpy(ð->h_source, cfg_mac_src, ETH_ALEN); @@ -225,6 +224,21 @@ static int frame_fill(void *buffer, unsigned int payload_len) return off + payload_len; } +static int frame_fill(void *buffer, unsigned int payload_len) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + int off = 0; + + if (cfg_enable_vnet) { + set_vheader(buffer); + off += sizeof(struct virtio_net_hdr); + } + + return set_packet(buffer, off, payload_len); +} + static void ring_write(void *slot) { struct tpacket2_hdr *header = slot; @@ -262,6 +276,69 @@ static void socket_write(int fd) error(1, 0, "send: %uB < %uB\n", ret, len); } +static void vector_write(int fd, int count) +{ + struct mmsghdr *loop, *msgvec = NULL; + struct iovec *iov = NULL; + int i, ret; + char *packet; + + fprintf(stderr, "vector size: %u\n", count); + msgvec = malloc(sizeof(struct mmsghdr) * count); + if (msgvec == NULL) { + error(1, ENOMEM, "alloc mmsg vector"); + } + iov = malloc(sizeof(struct iovec) * count * 3); + if (iov == NULL) { + error(1, ENOMEM, "alloc iov vector"); + } + loop = msgvec; + for (i = 0; i < count ; i++) { + loop->msg_hdr.msg_iov = iov; + loop->msg_hdr.msg_iovlen = 2; + loop->msg_hdr.msg_control = NULL; + loop->msg_hdr.msg_controllen = 0; + loop->msg_hdr.msg_flags = MSG_DONTWAIT; + loop->msg_hdr.msg_name = NULL; + loop->msg_hdr.msg_namelen = 0; + if (cfg_enable_vnet) { + loop->msg_hdr.msg_iovlen += 1; + iov->iov_base = malloc(sizeof (struct virtio_net_hdr)); + if (iov->iov_base == NULL) { + error(1, ENOMEM, "alloc vnet hdr"); + iov->iov_len = 0; + } else { + iov->iov_len = sizeof(struct virtio_net_hdr); + set_vheader(iov->iov_base); + } + iov++; + } + packet = malloc( + cfg_payload_len + sizeof(struct ethhdr) + + sizeof(struct iphdr) + sizeof(struct tcphdr)); + if (packet == NULL) { + error(1, ENOMEM, "alloc payload"); + iov->iov_len = 0; + } else { + set_packet(packet, 0, cfg_payload_len); + iov->iov_base = packet; + iov->iov_len = sizeof(struct ethhdr) + + sizeof(struct iphdr) + sizeof(struct tcphdr); + iov++; + iov->iov_base = packet + sizeof(struct ethhdr) + + sizeof(struct iphdr) + sizeof(struct tcphdr); + iov->iov_len = cfg_payload_len; + } + iov++; + loop++; + } + ret = sendmmsg(fd, msgvec, count, 0); + if (ret == -1) + error(1, errno, "send"); + if (ret < count) + error(1, 0, "send: %uB < %uB\n", ret, count); +} + static void socket_bind(int fd) { struct sockaddr_ll addr = { 0 }; @@ -342,7 +419,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "cCd:Gi:l:L:n:Nqs:v")) != -1) + while ((c = getopt(argc, argv, "cCd:Gi:l:L:n:Nqs:vZ")) != -1) { switch (c) { case 'c': @@ -383,6 +460,12 @@ static void parse_opts(int argc, char **argv) case 'v': cfg_enable_vnet = true; break; + case 'Z': + { + cfg_enable_ring = false; + cfg_vector_send = true; + } + break; default: error(1, 0, "unknown option %c", c); } @@ -415,7 +498,10 @@ int main(int argc, char **argv) do_run_ring(fd, ring); ring_close(ring); } else { - do_run(fd); + if (cfg_vector_send) { + vector_write(fd, cfg_num_frames); + } else + do_run(fd); } if (close(fd) == -1)