From 7ae9f2817ad5130fdffa3692ef69b9d0e91279e8 Mon Sep 17 00:00:00 2001 From: Michal Rostecki Date: Tue, 18 Jun 2019 20:13:18 +0200 Subject: samples: bpf: Remove bpf_debug macro in favor of bpf_printk ibumad example was implementing the bpf_debug macro which is exactly the same as the bpf_printk macro available in bpf_helpers.h. This change makes use of bpf_printk instead of bpf_debug. Signed-off-by: Michal Rostecki Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- samples/bpf/ibumad_kern.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'samples') diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c index 38b2b3f22049..f281df7e0089 100644 --- a/samples/bpf/ibumad_kern.c +++ b/samples/bpf/ibumad_kern.c @@ -31,15 +31,9 @@ struct bpf_map_def SEC("maps") write_count = { }; #undef DEBUG -#ifdef DEBUG -#define bpf_debug(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) -#else -#define bpf_debug(fmt, ...) +#ifndef DEBUG +#undef bpf_printk +#define bpf_printk(fmt, ...) #endif /* Taken from the current format defined in @@ -86,7 +80,7 @@ int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx) u64 zero = 0, *val; u8 class = ctx->mgmt_class; - bpf_debug("ib_umad read recv : class 0x%x\n", class); + bpf_printk("ib_umad read recv : class 0x%x\n", class); val = bpf_map_lookup_elem(&read_count, &class); if (!val) { @@ -106,7 +100,7 @@ int on_ib_umad_read_send(struct ib_umad_rw_args *ctx) u64 zero = 0, *val; u8 class = ctx->mgmt_class; - bpf_debug("ib_umad read send : class 0x%x\n", class); + bpf_printk("ib_umad read send : class 0x%x\n", class); val = bpf_map_lookup_elem(&read_count, &class); if (!val) { @@ -126,7 +120,7 @@ int on_ib_umad_write(struct ib_umad_rw_args *ctx) u64 zero = 0, *val; u8 class = ctx->mgmt_class; - bpf_debug("ib_umad write : class 0x%x\n", class); + bpf_printk("ib_umad write : class 0x%x\n", class); val = bpf_map_lookup_elem(&write_count, &class); if (!val) { -- cgit v1.2.1 From 9e859e8f199d4ab15ed418f7d17735c77619d14f Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 25 Jun 2019 09:55:36 +0900 Subject: samples: bpf: make the use of xdp samples consistent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, each xdp samples are inconsistent in the use. Most of the samples fetch the interface with it's name. (ex. xdp1, xdp2skb, xdp_redirect_cpu, xdp_sample_pkts, etc.) But some of the xdp samples are fetching the interface with ifindex by command argument. This commit enables xdp samples to fetch interface with it's name without changing the original index interface fetching. ( fetching in the same way as xdp_sample_pkts_user.c does.) Signed-off-by: Daniel T. Lee Acked-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_adjust_tail_user.c | 12 ++++++++++-- samples/bpf/xdp_redirect_map_user.c | 15 +++++++++++---- samples/bpf/xdp_redirect_user.c | 15 +++++++++++---- samples/bpf/xdp_tx_iptunnel_user.c | 12 ++++++++++-- 4 files changed, 42 insertions(+), 12 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index 586ff751aba9..a3596b617c4c 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -69,7 +70,7 @@ static void usage(const char *cmd) printf("Start a XDP prog which send ICMP \"packet too big\" \n" "messages if ingress packet is bigger then MAX_SIZE bytes\n"); printf("Usage: %s [...]\n", cmd); - printf(" -i Interface Index\n"); + printf(" -i Interface\n"); printf(" -T Default: 0 (forever)\n"); printf(" -S use skb-mode\n"); printf(" -N enforce native mode\n"); @@ -102,7 +103,9 @@ int main(int argc, char **argv) switch (opt) { case 'i': - ifindex = atoi(optarg); + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); break; case 'T': kill_after_s = atoi(optarg); @@ -136,6 +139,11 @@ int main(int argc, char **argv) return 1; } + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 15bb6f67f9c3..f70ee33907fd 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex) static void usage(const char *prog) { fprintf(stderr, - "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n" + "usage: %s [OPTS] _IN _OUT\n\n" "OPTS:\n" " -S use skb-mode\n" " -N enforce native mode\n" @@ -127,7 +128,7 @@ int main(int argc, char **argv) } if (optind == argc) { - printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]); + printf("usage: %s _IN _OUT\n", argv[0]); return 1; } @@ -136,8 +137,14 @@ int main(int argc, char **argv) return 1; } - ifindex_in = strtoul(argv[optind], NULL, 0); - ifindex_out = strtoul(argv[optind + 1], NULL, 0); + ifindex_in = if_nametoindex(argv[optind]); + if (!ifindex_in) + ifindex_in = strtoul(argv[optind], NULL, 0); + + ifindex_out = if_nametoindex(argv[optind + 1]); + if (!ifindex_out) + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + printf("input: %d output: %d\n", ifindex_in, ifindex_out); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index ce71be187205..39de06f3ec25 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex) static void usage(const char *prog) { fprintf(stderr, - "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n" + "usage: %s [OPTS] _IN _OUT\n\n" "OPTS:\n" " -S use skb-mode\n" " -N enforce native mode\n" @@ -128,7 +129,7 @@ int main(int argc, char **argv) } if (optind == argc) { - printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]); + printf("usage: %s _IN _OUT\n", argv[0]); return 1; } @@ -137,8 +138,14 @@ int main(int argc, char **argv) return 1; } - ifindex_in = strtoul(argv[optind], NULL, 0); - ifindex_out = strtoul(argv[optind + 1], NULL, 0); + ifindex_in = if_nametoindex(argv[optind]); + if (!ifindex_in) + ifindex_in = strtoul(argv[optind], NULL, 0); + + ifindex_out = if_nametoindex(argv[optind + 1]); + if (!ifindex_out) + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + printf("input: %d output: %d\n", ifindex_in, ifindex_out); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 394896430712..dfb68582e243 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -83,7 +84,7 @@ static void usage(const char *cmd) "in an IPv4/v6 header and XDP_TX it out. The dst \n" "is used to select packets to encapsulate\n\n"); printf("Usage: %s [...]\n", cmd); - printf(" -i Interface Index\n"); + printf(" -i Interface\n"); printf(" -a IPv4 or IPv6\n"); printf(" -p A port range (e.g. 433-444) is also allowed\n"); printf(" -s Used in the IPTunnel header\n"); @@ -181,7 +182,9 @@ int main(int argc, char **argv) switch (opt) { case 'i': - ifindex = atoi(optarg); + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); break; case 'a': vip.family = parse_ipstr(optarg, vip.daddr.v6); @@ -253,6 +256,11 @@ int main(int argc, char **argv) return 1; } + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; -- cgit v1.2.1 From 123e8da1d33042a83cedb530fb5efd64f32ce594 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 26 Jun 2019 17:35:27 +0300 Subject: xsk: Change the default frame size to 4096 and allow controlling it The typical XDP memory scheme is one packet per page. Change the AF_XDP frame size in libbpf to 4096, which is the page size on x86, to allow libbpf to be used with the drivers with the packet-per-page scheme. Add a command line option -f to xdpsock to allow to specify a custom frame size. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Acked-by: Saeed Mahameed Signed-off-by: Daniel Borkmann --- samples/bpf/xdpsock_user.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 0f5eb0d7f2df..93eaaf7239b2 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -68,6 +68,7 @@ static int opt_queue; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags; +static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; static __u32 prog_id; struct xsk_umem_info { @@ -276,6 +277,12 @@ static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr) static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) { struct xsk_umem_info *umem; + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = opt_xsk_frame_size, + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + }; int ret; umem = calloc(1, sizeof(*umem)); @@ -283,7 +290,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) exit_with_error(errno); ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - NULL); + &cfg); if (ret) exit_with_error(-ret); @@ -323,11 +330,9 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem) &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) exit_with_error(-ret); - for (i = 0; - i < XSK_RING_PROD__DEFAULT_NUM_DESCS * - XSK_UMEM__DEFAULT_FRAME_SIZE; - i += XSK_UMEM__DEFAULT_FRAME_SIZE) - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i; + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = + i * opt_xsk_frame_size; xsk_ring_prod__submit(&xsk->umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); @@ -346,6 +351,7 @@ static struct option long_options[] = { {"interval", required_argument, 0, 'n'}, {"zero-copy", no_argument, 0, 'z'}, {"copy", no_argument, 0, 'c'}, + {"frame-size", required_argument, 0, 'f'}, {0, 0, 0, 0} }; @@ -365,8 +371,9 @@ static void usage(const char *prog) " -n, --interval=n Specify statistics update interval (default 1 sec).\n" " -z, --zero-copy Force zero-copy mode.\n" " -c, --copy Force copy mode.\n" + " -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n" "\n"; - fprintf(stderr, str, prog); + fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE); exit(EXIT_FAILURE); } @@ -377,7 +384,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:cz", long_options, + c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options, &option_index); if (c == -1) break; @@ -420,6 +427,9 @@ static void parse_command_line(int argc, char **argv) case 'F': opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; + case 'f': + opt_xsk_frame_size = atoi(optarg); + break; default: usage(basename(argv[0])); } @@ -432,6 +442,11 @@ static void parse_command_line(int argc, char **argv) usage(basename(argv[0])); } + if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) { + fprintf(stderr, "--frame-size=%d is not a power of two\n", + opt_xsk_frame_size); + usage(basename(argv[0])); + } } static void kick_tx(struct xsk_socket_info *xsk) @@ -583,8 +598,7 @@ static void tx_only(struct xsk_socket_info *xsk) for (i = 0; i < BATCH_SIZE; i++) { xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr - = (frame_nb + i) << - XSK_UMEM__DEFAULT_FRAME_SHIFT; + = (frame_nb + i) * opt_xsk_frame_size; xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = sizeof(pkt_data) - 1; } @@ -661,21 +675,19 @@ int main(int argc, char **argv) } ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ - NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + NUM_FRAMES * opt_xsk_frame_size); if (ret) exit_with_error(ret); /* Create sockets... */ - umem = xsk_configure_umem(bufs, - NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); xsks[num_socks++] = xsk_configure_socket(umem); if (opt_bench == BENCH_TXONLY) { int i; - for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE; - i += XSK_UMEM__DEFAULT_FRAME_SIZE) - (void)gen_eth_frame(umem, i); + for (i = 0; i < NUM_FRAMES; i++) + (void)gen_eth_frame(umem, i * opt_xsk_frame_size); } signal(SIGINT, int_exit); -- cgit v1.2.1 From 71634d7f92093062e7b8b2a9efffa5569edcca8b Mon Sep 17 00:00:00 2001 From: brakmo Date: Tue, 2 Jul 2019 15:09:52 -0700 Subject: bpf: Add support for fq's EDT to HBM Adds support for fq's Earliest Departure Time to HBM (Host Bandwidth Manager). Includes a new BPF program supporting EDT, and also updates corresponding programs. It will drop packets with an EDT of more than 500us in the future unless the packet belongs to a flow with less than 2 packets in flight. This is done so each flow has at least 2 packets in flight, so they will not starve, and also to help prevent delayed ACK timeouts. It will also work with ECN enabled traffic, where the packets will be CE marked if their EDT is more than 50us in the future. The table below shows some performance numbers. The flows are back to back RPCS. One server sending to another, either 2 or 4 flows. One flow is a 10KB RPC, the rest are 1MB RPCs. When there are more than one flow of a given RPC size, the numbers represent averages. The rate limit applies to all flows (they are in the same cgroup). Tests ending with "-edt" ran with the new BPF program supporting EDT. Tests ending with "-hbt" ran on top HBT qdisc with the specified rate (i.e. no HBM). The other tests ran with the HBM BPF program included in the HBM patch-set. EDT has limited value when using DCTCP, but it helps in many cases when using Cubic. It usually achieves larger link utilization and lower 99% latencies for the 1MB RPCs. HBM ends up queueing a lot of packets with its default parameter values, reducing the goodput of the 10KB RPCs and increasing their latency. Also, the RTTs seen by the flows are quite large. Aggr 10K 10K 10K 1MB 1MB 1MB Limit rate drops RTT rate P90 P99 rate P90 P99 Test rate Flows Mbps % us Mbps us us Mbps ms ms -------- ---- ----- ---- ----- --- ---- ---- ---- ---- ---- ---- cubic 1G 2 904 0.02 108 257 511 539 647 13.4 24.5 cubic-edt 1G 2 982 0.01 156 239 656 967 743 14.0 17.2 dctcp 1G 2 977 0.00 105 324 408 744 653 14.5 15.9 dctcp-edt 1G 2 981 0.01 142 321 417 811 660 15.7 17.0 cubic-htb 1G 2 919 0.00 1825 40 2822 4140 879 9.7 9.9 cubic 200M 2 155 0.30 220 81 532 655 74 283 450 cubic-edt 200M 2 188 0.02 222 87 1035 1095 101 84 85 dctcp 200M 2 188 0.03 111 77 912 939 111 76 325 dctcp-edt 200M 2 188 0.03 217 74 1416 1738 114 76 79 cubic-htb 200M 2 188 0.00 5015 8 14ms 15ms 180 48 50 cubic 1G 4 952 0.03 110 165 516 546 262 38 154 cubic-edt 1G 4 973 0.01 190 111 1034 1314 287 65 79 dctcp 1G 4 951 0.00 103 180 617 905 257 37 38 dctcp-edt 1G 4 967 0.00 163 151 732 1126 272 43 55 cubic-htb 1G 4 914 0.00 3249 13 7ms 8ms 300 29 34 cubic 5G 4 4236 0.00 134 305 490 624 1310 10 17 cubic-edt 5G 4 4865 0.00 156 306 425 759 1520 10 16 dctcp 5G 4 4936 0.00 128 485 221 409 1484 7 9 dctcp-edt 5G 4 4924 0.00 148 390 392 623 1508 11 26 v1 -> v2: Incorporated Andrii's suggestions v2 -> v3: Incorporated Yonghong's suggestions v3 -> v4: Removed credit update that is not needed Signed-off-by: Lawrence Brakmo Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 2 + samples/bpf/do_hbm_test.sh | 22 +++--- samples/bpf/hbm.c | 18 ++++- samples/bpf/hbm_edt_kern.c | 168 +++++++++++++++++++++++++++++++++++++++++++++ samples/bpf/hbm_kern.h | 40 +++++++++-- 5 files changed, 231 insertions(+), 19 deletions(-) create mode 100644 samples/bpf/hbm_edt_kern.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0917f8cf4fab..35640414ebb3 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -168,6 +168,7 @@ always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o always += ibumad_kern.o always += hbm_out_kern.o +always += hbm_edt_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/ @@ -272,6 +273,7 @@ $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h +$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index e48b047d4646..ffe4c0607341 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -14,7 +14,7 @@ Usage() { echo "loads. The output is the goodput in Mbps (unless -D was used)." echo "" echo "USAGE: $name [out] [-b=|--bpf=] [-c=|--cc=]" - echo " [-D] [-d=|--delay=] [--debug] [-E]" + echo " [-D] [-d=|--delay=] [--debug] [-E] [--edt]" echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=|--id=]" echo " [-l] [-N] [--no_cn] [-p=|--port=] [-P]" echo " [-q=] [-R] [-s=|--server= /dev/null 2>&1 tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 elif [ "$qdisc" != "" ] ; then - tc qdisc del dev lo root > /dev/null 2>&1 - tc qdisc add dev lo root $qdisc > /dev/null 2>&1 + tc qdisc del dev eth0 root > /dev/null 2>&1 + tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1 fi n=0 @@ -399,7 +399,9 @@ fi if [ "$netem" -ne "0" ] ; then tc qdisc del dev lo root > /dev/null 2>&1 fi - +if [ "$qdisc" != "" ] ; then + tc qdisc del dev eth0 root > /dev/null 2>&1 +fi sleep 2 hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index b905b32ff185..e0fbab9bec83 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -62,6 +62,7 @@ bool loopback_flag; bool debugFlag; bool work_conserving_flag; bool no_cn_flag; +bool edt_flag; static void Usage(void); static void read_trace_pipe2(void); @@ -372,9 +373,14 @@ static int run_bpf_prog(char *prog, int cg_id) fprintf(fout, "avg rtt:%d\n", (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); // Average credit - fprintf(fout, "avg credit:%d\n", - (int)(qstats.sum_credit / - (1500 * ((int)qstats.pkts_total) + 1))); + if (edt_flag) + fprintf(fout, "avg credit_ms:%.03f\n", + (qstats.sum_credit / + (qstats.pkts_total + 1.0)) / 1000000.0); + else + fprintf(fout, "avg credit:%d\n", + (int)(qstats.sum_credit / + (1500 * ((int)qstats.pkts_total ) + 1))); // Return values stats for (k = 0; k < RET_VAL_COUNT; k++) { @@ -408,6 +414,7 @@ static void Usage(void) " Where:\n" " -o indicates egress direction (default)\n" " -d print BPF trace debug buffer\n" + " --edt use fq's Earliest Departure Time\n" " -l also limit flows using loopback\n" " -n <#> to create cgroup \"/hbm#\" and attach prog\n" " Default is /hbm1\n" @@ -433,6 +440,7 @@ int main(int argc, char **argv) char *optstring = "iodln:r:st:wh"; struct option loptions[] = { {"no_cn", 0, NULL, 1}, + {"edt", 0, NULL, 2}, {NULL, 0, NULL, 0} }; @@ -441,6 +449,10 @@ int main(int argc, char **argv) case 1: no_cn_flag = true; break; + case 2: + prog = "hbm_edt_kern.o"; + edt_flag = true; + break; case'o': break; case 'd': diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c new file mode 100644 index 000000000000..a65b677acdb0 --- /dev/null +++ b/samples/bpf/hbm_edt_kern.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * a user program, such as hbm, can access. + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + long long delta = 0, delta_send; + unsigned long long curtime, sendtime; + struct hbm_queue_stats *qsp = NULL; + unsigned int queue_index = 0; + bool congestion_flag = false; + bool ecn_ce_flag = false; + struct hbm_pkt_info pkti = {}; + struct hbm_vqueue *qdp; + bool drop_flag = false; + bool cwr_flag = false; + int len = skb->len; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + + // Check if we should ignore loopback traffic + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + if (qdp->lasttime == 0) + hbm_init_edt_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + delta = qdp->lasttime - curtime; + // bound bursts to 100us + if (delta < -BURST_SIZE_NS) { + // negative delta is a credit that allows bursts + qdp->lasttime = curtime - BURST_SIZE_NS; + delta = -BURST_SIZE_NS; + } + sendtime = qdp->lasttime; + delta_send = BYTES_TO_NS(len, qdp->rate); + __sync_add_and_fetch(&(qdp->lasttime), delta_send); + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Set EDT of packet + skb->tstamp = sendtime; + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) + qdp->rate = qsp->rate * 128; + + // Set flags (drop, congestion, cwr) + // last packet will be sent in the future, bound latency + if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS && + len > LARGE_PKT_THRESH)) { + drop_flag = true; + if (pkti.is_tcp && pkti.ecn == 0) + cwr_flag = true; + } else if (delta > MARK_THRESH_NS) { + if (pkti.is_tcp) + congestion_flag = true; + else + drop_flag = true; + } + + if (congestion_flag) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (delta >= MARK_THRESH_NS + + (rand % MARK_REGION_SIZE_NS)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + congestion_flag = false; + } + } + } + + if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) { + drop_flag = false; + cwr_flag = true; + congestion_flag = false; + } + + if (qsp != NULL && qsp->no_cn) + cwr_flag = false; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, (int) delta); + + if (drop_flag) { + __sync_add_and_fetch(&(qdp->lasttime), -delta_send); + rv = DROP_PKT; + } + + if (cwr_flag) + rv |= CWR; + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index be19cf1d5cd5..aa207a2eebbd 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -29,6 +29,7 @@ #define DROP_PKT 0 #define ALLOW_PKT 1 #define TCP_ECN_OK 1 +#define CWR 2 #ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging #undef bpf_printk @@ -45,8 +46,18 @@ #define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) #define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) +// Time base accounting for fq's EDT +#define BURST_SIZE_NS 100000 // 100us +#define MARK_THRESH_NS 50000 // 50us +#define DROP_THRESH_NS 500000 // 500us +// Reserve 20us of queuing for small packets (less than 120 bytes) +#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000) +#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS) + // rate in bytes per ns << 20 #define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate)) struct bpf_map_def SEC("maps") queue_state = { .type = BPF_MAP_TYPE_CGROUP_STORAGE, @@ -67,6 +78,7 @@ BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); struct hbm_pkt_info { int cwnd; int rtt; + int packets_out; bool is_ip; bool is_tcp; short ecn; @@ -86,16 +98,20 @@ static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) if (tp) { pkti->cwnd = tp->snd_cwnd; pkti->rtt = tp->srtt_us >> 3; + pkti->packets_out = tp->packets_out; return 0; } } } } + pkti->cwnd = 0; + pkti->rtt = 0; + pkti->packets_out = 0; return 1; } -static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, - struct hbm_pkt_info *pkti) +static void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) { struct iphdr iph; struct ipv6hdr *ip6h; @@ -123,10 +139,22 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) { - bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); - qdp->lasttime = bpf_ktime_get_ns(); - qdp->credit = INIT_CREDIT; - qdp->rate = rate * 128; + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp, + int rate) +{ + unsigned long long curtime; + + curtime = bpf_ktime_get_ns(); + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst + qdp->credit = 0; // not used + qdp->rate = rate * 128; } static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, -- cgit v1.2.1 From 395338843de1e47f6876e58adfe75b6483f96fff Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:14:02 -0700 Subject: samples/bpf: add sample program that periodically dumps TCP stats Uses new RTT callback to dump stats every second. $ mkdir -p /tmp/cgroupv2 $ mount -t cgroup2 none /tmp/cgroupv2 $ mkdir -p /tmp/cgroupv2/foo $ echo $$ >> /tmp/cgroupv2/foo/cgroup.procs $ bpftool prog load ./tcp_dumpstats_kern.o /sys/fs/bpf/tcp_prog $ bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog $ bpftool prog tracelog $ # run neper/netperf/etc Used neper to compare performance with and without this program attached and didn't see any noticeable performance impact. Sample output: -0 [015] ..s. 2074.128800: 0: dsack_dups=0 delivered=242526 -0 [015] ..s. 2074.128808: 0: delivered_ce=0 icsk_retransmits=0 -0 [015] ..s. 2075.130133: 0: dsack_dups=0 delivered=323599 -0 [015] ..s. 2075.130138: 0: delivered_ce=0 icsk_retransmits=0 -0 [005] .Ns. 2076.131440: 0: dsack_dups=0 delivered=404648 -0 [005] .Ns. 2076.131447: 0: delivered_ce=0 icsk_retransmits=0 Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 1 + samples/bpf/tcp_dumpstats_kern.c | 68 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 samples/bpf/tcp_dumpstats_kern.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 35640414ebb3..f90daadfbc89 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -154,6 +154,7 @@ always += tcp_iw_kern.o always += tcp_clamp_kern.o always += tcp_basertt_kern.o always += tcp_tos_reflect_kern.o +always += tcp_dumpstats_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c new file mode 100644 index 000000000000..8557913106a0 --- /dev/null +++ b/samples/bpf/tcp_dumpstats_kern.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Refer to samples/bpf/tcp_bpf.readme for the instructions on + * how to run this sample program. + */ +#include + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define INTERVAL 1000000000ULL + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +struct { + __u32 type; + __u32 map_flags; + int *key; + __u64 *value; +} bpf_next_dump SEC(".maps") = { + .type = BPF_MAP_TYPE_SK_STORAGE, + .map_flags = BPF_F_NO_PREALLOC, +}; + +SEC("sockops") +int _sockops(struct bpf_sock_ops *ctx) +{ + struct bpf_tcp_sock *tcp_sk; + struct bpf_sock *sk; + __u64 *next_dump; + __u64 now; + + switch (ctx->op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG); + return 1; + case BPF_SOCK_OPS_RTT_CB: + break; + default: + return 1; + } + + sk = ctx->sk; + if (!sk) + return 1; + + next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!next_dump) + return 1; + + now = bpf_ktime_get_ns(); + if (now < *next_dump) + return 1; + + tcp_sk = bpf_tcp_sock(sk); + if (!tcp_sk) + return 1; + + *next_dump = now + INTERVAL; + + bpf_printk("dsack_dups=%u delivered=%u\n", + tcp_sk->dsack_dups, tcp_sk->delivered); + bpf_printk("delivered_ce=%u icsk_retransmits=%u\n", + tcp_sk->delivered_ce, tcp_sk->icsk_retransmits); + + return 1; +} -- cgit v1.2.1 From d78e3f0614f866d4456f860dc5773a6176eb9937 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:14:03 -0700 Subject: samples/bpf: fix tcp_bpf.readme detach command Copy-paste, should be detach, not attach. Signed-off-by: Stanislav Fomichev Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Daniel Borkmann --- samples/bpf/tcp_bpf.readme | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme index fee746621aec..78e247f62108 100644 --- a/samples/bpf/tcp_bpf.readme +++ b/samples/bpf/tcp_bpf.readme @@ -25,4 +25,4 @@ attached to the cgroupv2). To remove (unattach) a socket_ops BPF program from a cgroupv2: - bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog + bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog -- cgit v1.2.1