| // SPDX-License-Identifier: GPL-2.0 |
| /* Copyright (c) 2019 Facebook |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of version 2 of the GNU General Public |
| * License as published by the Free Software Foundation. |
| * |
| * Example program for Host Bandwidth Management |
| * |
| * This program loads a cgroup skb BPF program to enforce cgroup output |
| * (egress) or input (ingress) bandwidth limits. |
| * |
| * USAGE: hbm [-d] [-l] [-n <id>] [--no_cn] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] |
| * Where: |
| * -d Print BPF trace debug buffer |
| * -l Also limit flows doing loopback |
| * -n <#> To create cgroup "/hbm#" and attach prog |
| * Default is /hbm1 |
| * --no_cn Do not return cn notifications |
| * -r <rate> Rate limit in Mbps |
| * -s Get HBM stats (marked, dropped, etc.) |
| * -t <time> Exit after specified seconds (default is 1) |
| * -w Work conserving flag. cgroup can increase its bandwidth |
| * beyond the rate limit specified while there is available |
| * bandwidth. Current implementation assumes there is only |
| * one NIC (eth0), but can be extended to support multiple NICs. |
| * Currently only supported for egress. |
| * -h Print this info |
| * prog BPF program file name. Name defaults to hbm_out_kern.o |
| */ |
| |
| #define _GNU_SOURCE |
| |
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>

#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <getopt.h>

#include "bpf_load.h"
#include "bpf_rlimit.h"
#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
#include "bpf.h"
#include "libbpf.h"
| |
/* Root of the tracing debugfs tree; "trace_pipe" is opened beneath it. */
#define DEBUGFS "/sys/kernel/debug/tracing/"

/* Helpers defined further down in this file. */
static void do_error(char *msg, bool errno_flag);
static void read_trace_pipe2(void);
static void Usage(void);

/* Handles produced while loading the BPF object. */
struct bpf_object *obj;
int cgroup_storage_fd;
int bpfprog_fd;

/* Direction: true attaches at egress, false at ingress. */
bool outFlag = true;

/* Rate settings, in Mbps. */
int rate = 1000;	/* current limit; work conserving mode adjusts it */
int minRate = 1000;	/* floor that work conserving mode never goes below */

/* Run time in seconds before the stats are collected. */
int dur = 1;

/* Behaviour switches, all off unless enabled from the command line. */
bool debugFlag;
bool edt_flag;
bool loopback_flag;
bool no_cn_flag;
bool stats_flag;
bool work_conserving_flag;
| |
| static void read_trace_pipe2(void) |
| { |
| int trace_fd; |
| FILE *outf; |
| char *outFname = "hbm_out.log"; |
| |
| trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); |
| if (trace_fd < 0) { |
| printf("Error opening trace_pipe\n"); |
| return; |
| } |
| |
| // Future support of ingress |
| // if (!outFlag) |
| // outFname = "hbm_in.log"; |
| outf = fopen(outFname, "w"); |
| |
| if (outf == NULL) |
| printf("Error creating %s\n", outFname); |
| |
| while (1) { |
| static char buf[4097]; |
| ssize_t sz; |
| |
| sz = read(trace_fd, buf, sizeof(buf) - 1); |
| if (sz > 0) { |
| buf[sz] = 0; |
| puts(buf); |
| if (outf != NULL) { |
| fprintf(outf, "%s\n", buf); |
| fflush(outf); |
| } |
| } |
| } |
| } |
| |
/*
 * Print an "ERROR: ..." line on stdout and terminate with exit status 1.
 *
 * @msg:        text describing what failed
 * @errno_flag: when true, the current errno value is appended to the line
 *
 * Never returns.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
| |
| static int prog_load(char *prog) |
| { |
| struct bpf_prog_load_attr prog_load_attr = { |
| .prog_type = BPF_PROG_TYPE_CGROUP_SKB, |
| .file = prog, |
| .expected_attach_type = BPF_CGROUP_INET_EGRESS, |
| }; |
| int map_fd; |
| struct bpf_map *map; |
| |
| int ret = 0; |
| |
| if (access(prog, O_RDONLY) < 0) { |
| printf("Error accessing file %s: %s\n", prog, strerror(errno)); |
| return 1; |
| } |
| if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) |
| ret = 1; |
| if (!ret) { |
| map = bpf_object__find_map_by_name(obj, "queue_stats"); |
| map_fd = bpf_map__fd(map); |
| if (map_fd < 0) { |
| printf("Map not found: %s\n", strerror(map_fd)); |
| ret = 1; |
| } |
| } |
| |
| if (ret) { |
| printf("ERROR: load_bpf_file failed for: %s\n", prog); |
| printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); |
| ret = -1; |
| } else { |
| ret = map_fd; |
| } |
| |
| return ret; |
| } |
| |
| static int run_bpf_prog(char *prog, int cg_id) |
| { |
| int map_fd; |
| int rc = 0; |
| int key = 0; |
| int cg1 = 0; |
| int type = BPF_CGROUP_INET_EGRESS; |
| char cg_dir[100]; |
| struct hbm_queue_stats qstats = {0}; |
| |
| sprintf(cg_dir, "/hbm%d", cg_id); |
| map_fd = prog_load(prog); |
| if (map_fd == -1) |
| return 1; |
| |
| if (setup_cgroup_environment()) { |
| printf("ERROR: setting cgroup environment\n"); |
| goto err; |
| } |
| cg1 = create_and_get_cgroup(cg_dir); |
| if (!cg1) { |
| printf("ERROR: create_and_get_cgroup\n"); |
| goto err; |
| } |
| if (join_cgroup(cg_dir)) { |
| printf("ERROR: join_cgroup\n"); |
| goto err; |
| } |
| |
| qstats.rate = rate; |
| qstats.stats = stats_flag ? 1 : 0; |
| qstats.loopback = loopback_flag ? 1 : 0; |
| qstats.no_cn = no_cn_flag ? 1 : 0; |
| if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { |
| printf("ERROR: Could not update map element\n"); |
| goto err; |
| } |
| |
| if (!outFlag) |
| type = BPF_CGROUP_INET_INGRESS; |
| if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { |
| printf("ERROR: bpf_prog_attach fails!\n"); |
| log_err("Attaching prog"); |
| goto err; |
| } |
| |
| if (work_conserving_flag) { |
| struct timeval t0, t_last, t_new; |
| FILE *fin; |
| unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; |
| signed long long last_cg_tx_bytes, new_cg_tx_bytes; |
| signed long long delta_time, delta_bytes, delta_rate; |
| int delta_ms; |
| #define DELTA_RATE_CHECK 10000 /* in us */ |
| #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ |
| |
| bpf_map_lookup_elem(map_fd, &key, &qstats); |
| if (gettimeofday(&t0, NULL) < 0) |
| do_error("gettimeofday failed", true); |
| t_last = t0; |
| fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); |
| if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) |
| do_error("fscanf fails", false); |
| fclose(fin); |
| last_cg_tx_bytes = qstats.bytes_total; |
| while (true) { |
| usleep(DELTA_RATE_CHECK); |
| if (gettimeofday(&t_new, NULL) < 0) |
| do_error("gettimeofday failed", true); |
| delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + |
| (t_new.tv_usec - t0.tv_usec)/1000; |
| if (delta_ms > dur * 1000) |
| break; |
| delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + |
| (t_new.tv_usec - t_last.tv_usec); |
| if (delta_time == 0) |
| continue; |
| t_last = t_new; |
| fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", |
| "r"); |
| if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) |
| do_error("fscanf fails", false); |
| fclose(fin); |
| printf(" new_eth_tx_bytes:%llu\n", |
| new_eth_tx_bytes); |
| bpf_map_lookup_elem(map_fd, &key, &qstats); |
| new_cg_tx_bytes = qstats.bytes_total; |
| delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; |
| last_eth_tx_bytes = new_eth_tx_bytes; |
| delta_rate = (delta_bytes * 8000000) / delta_time; |
| printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", |
| delta_ms, delta_rate/1000000000.0, |
| rate/1000.0); |
| if (delta_rate < RATE_THRESHOLD) { |
| /* can increase cgroup rate limit, but first |
| * check if we are using the current limit. |
| * Currently increasing by 6.25%, unknown |
| * if that is the optimal rate. |
| */ |
| int rate_diff100; |
| |
| delta_bytes = new_cg_tx_bytes - |
| last_cg_tx_bytes; |
| last_cg_tx_bytes = new_cg_tx_bytes; |
| delta_rate = (delta_bytes * 8000000) / |
| delta_time; |
| printf(" rate:%.3fGbps", |
| delta_rate/1000000000.0); |
| rate_diff100 = (((long long)rate)*1000000 - |
| delta_rate) * 100 / |
| (((long long) rate) * 1000000); |
| printf(" rdiff:%d", rate_diff100); |
| if (rate_diff100 <= 3) { |
| rate += (rate >> 4); |
| if (rate > RATE_THRESHOLD / 1000000) |
| rate = RATE_THRESHOLD / 1000000; |
| qstats.rate = rate; |
| printf(" INC\n"); |
| } else { |
| printf("\n"); |
| } |
| } else { |
| /* Need to decrease cgroup rate limit. |
| * Currently decreasing by 12.5%, unknown |
| * if that is optimal |
| */ |
| printf(" DEC\n"); |
| rate -= (rate >> 3); |
| if (rate < minRate) |
| rate = minRate; |
| qstats.rate = rate; |
| } |
| if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) |
| do_error("update map element fails", false); |
| } |
| } else { |
| sleep(dur); |
| } |
| // Get stats! |
| if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { |
| char fname[100]; |
| FILE *fout; |
| |
| if (!outFlag) |
| sprintf(fname, "hbm.%d.in", cg_id); |
| else |
| sprintf(fname, "hbm.%d.out", cg_id); |
| fout = fopen(fname, "w"); |
| fprintf(fout, "id:%d\n", cg_id); |
| fprintf(fout, "ERROR: Could not lookup queue_stats\n"); |
| } else if (stats_flag && qstats.lastPacketTime > |
| qstats.firstPacketTime) { |
| long long delta_us = (qstats.lastPacketTime - |
| qstats.firstPacketTime)/1000; |
| unsigned int rate_mbps = ((qstats.bytes_total - |
| qstats.bytes_dropped) * 8 / |
| delta_us); |
| double percent_pkts, percent_bytes; |
| char fname[100]; |
| FILE *fout; |
| int k; |
| static const char *returnValNames[] = { |
| "DROP_PKT", |
| "ALLOW_PKT", |
| "DROP_PKT_CWR", |
| "ALLOW_PKT_CWR" |
| }; |
| #define RET_VAL_COUNT 4 |
| |
| // Future support of ingress |
| // if (!outFlag) |
| // sprintf(fname, "hbm.%d.in", cg_id); |
| // else |
| sprintf(fname, "hbm.%d.out", cg_id); |
| fout = fopen(fname, "w"); |
| fprintf(fout, "id:%d\n", cg_id); |
| fprintf(fout, "rate_mbps:%d\n", rate_mbps); |
| fprintf(fout, "duration:%.1f secs\n", |
| (qstats.lastPacketTime - qstats.firstPacketTime) / |
| 1000000000.0); |
| fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); |
| fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / |
| 1000000)); |
| fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); |
| fprintf(fout, "bytes_dropped_MB:%d\n", |
| (int)(qstats.bytes_dropped / |
| 1000000)); |
| // Marked Pkts and Bytes |
| percent_pkts = (qstats.pkts_marked * 100.0) / |
| (qstats.pkts_total + 1); |
| percent_bytes = (qstats.bytes_marked * 100.0) / |
| (qstats.bytes_total + 1); |
| fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); |
| fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); |
| |
| // Dropped Pkts and Bytes |
| percent_pkts = (qstats.pkts_dropped * 100.0) / |
| (qstats.pkts_total + 1); |
| percent_bytes = (qstats.bytes_dropped * 100.0) / |
| (qstats.bytes_total + 1); |
| fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); |
| fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); |
| |
| // ECN CE markings |
| percent_pkts = (qstats.pkts_ecn_ce * 100.0) / |
| (qstats.pkts_total + 1); |
| fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, |
| (int)qstats.pkts_ecn_ce); |
| |
| // Average cwnd |
| fprintf(fout, "avg cwnd:%d\n", |
| (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); |
| // Average rtt |
| fprintf(fout, "avg rtt:%d\n", |
| (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); |
| // Average credit |
| if (edt_flag) |
| fprintf(fout, "avg credit_ms:%.03f\n", |
| (qstats.sum_credit / |
| (qstats.pkts_total + 1.0)) / 1000000.0); |
| else |
| fprintf(fout, "avg credit:%d\n", |
| (int)(qstats.sum_credit / |
| (1500 * ((int)qstats.pkts_total ) + 1))); |
| |
| // Return values stats |
| for (k = 0; k < RET_VAL_COUNT; k++) { |
| percent_pkts = (qstats.returnValCount[k] * 100.0) / |
| (qstats.pkts_total + 1); |
| fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], |
| percent_pkts, (int)qstats.returnValCount[k]); |
| } |
| fclose(fout); |
| } |
| |
| if (debugFlag) |
| read_trace_pipe2(); |
| return rc; |
| err: |
| rc = 1; |
| |
| if (cg1) |
| close(cg1); |
| cleanup_cgroup_environment(); |
| |
| return rc; |
| } |
| |
/*
 * Print the command-line help text to stdout.
 *
 * The synopsis lists every option that main() accepts, and the default
 * quoted for -t matches the initial value of dur (1 second).
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn]\n"
	       " [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
| |
| int main(int argc, char **argv) |
| { |
| char *prog = "hbm_out_kern.o"; |
| int k; |
| int cg_id = 1; |
| char *optstring = "iodln:r:st:wh"; |
| struct option loptions[] = { |
| {"no_cn", 0, NULL, 1}, |
| {"edt", 0, NULL, 2}, |
| {NULL, 0, NULL, 0} |
| }; |
| |
| while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { |
| switch (k) { |
| case 1: |
| no_cn_flag = true; |
| break; |
| case 2: |
| prog = "hbm_edt_kern.o"; |
| edt_flag = true; |
| break; |
| case'o': |
| break; |
| case 'd': |
| debugFlag = true; |
| break; |
| case 'l': |
| loopback_flag = true; |
| break; |
| case 'n': |
| cg_id = atoi(optarg); |
| break; |
| case 'r': |
| minRate = atoi(optarg) * 1.024; |
| rate = minRate; |
| break; |
| case 's': |
| stats_flag = true; |
| break; |
| case 't': |
| dur = atoi(optarg); |
| break; |
| case 'w': |
| work_conserving_flag = true; |
| break; |
| case '?': |
| if (optopt == 'n' || optopt == 'r' || optopt == 't') |
| fprintf(stderr, |
| "Option -%c requires an argument.\n\n", |
| optopt); |
| case 'h': |
| // fallthrough |
| default: |
| Usage(); |
| return 0; |
| } |
| } |
| |
| if (optind < argc) |
| prog = argv[optind]; |
| printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); |
| |
| return run_bpf_prog(prog, cg_id); |
| } |