From 2bf8c7e735ac8ab81eebd672caee41c77ec1b793 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Tue, 18 Aug 2020 14:16:41 +0900 Subject: samples: bpf: Fix broken bpf programs due to removed symbol >From commit f1394b798814 ("block: mark blk_account_io_completion static") symbol blk_account_io_completion() has been marked as static, which makes it no longer possible to attach kprobe to this event. Currently, there are broken samples due to this reason. As a solution to this, attach kprobe events to blk_account_io_done() to modify them to perform the same behavior as before. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200818051641.21724-1-danieltimlee@gmail.com --- samples/bpf/task_fd_query_kern.c | 2 +- samples/bpf/task_fd_query_user.c | 2 +- samples/bpf/tracex3_kern.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'samples') diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c index 278ade5427c8..c821294e1774 100644 --- a/samples/bpf/task_fd_query_kern.c +++ b/samples/bpf/task_fd_query_kern.c @@ -10,7 +10,7 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } -SEC("kretprobe/blk_account_io_completion") +SEC("kretprobe/blk_account_io_done") int bpf_prog2(struct pt_regs *ctx) { return 0; diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index ff2e9c1c7266..4a74531dc403 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -314,7 +314,7 @@ int main(int argc, char **argv) /* test two functions in the corresponding *_kern.c file */ CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request", BPF_FD_TYPE_KPROBE)); - CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion", + CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done", BPF_FD_TYPE_KRETPROBE)); /* test nondebug fs kprobe */ diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index 659613c19a82..710a4410b2fb 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -49,7 +49,7 @@ struct { __uint(max_entries, SLOTS); } lat_map SEC(".maps"); -SEC("kprobe/blk_account_io_completion") +SEC("kprobe/blk_account_io_done") int bpf_prog2(struct pt_regs *ctx) { long rq = PT_REGS_PARM1(ctx); -- cgit v1.2.3 From 35a8b6dd339f04cbcb0b2d085334263542a12b70 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 23 Aug 2020 17:53:32 +0900 Subject: samples: bpf: Cleanup bpf_load.o from Makefile Since commit cc7f641d637b ("samples: bpf: Refactor BPF map performance test with libbpf") has ommited the removal of bpf_load.o from Makefile, this commit removes the bpf_load.o rule for targets where bpf_load.o is not used. Fixes: cc7f641d637b ("samples: bpf: Refactor BPF map performance test with libbpf") Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200823085334.9413-2-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index f87ee02073ba..0cac89230c6d 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -76,7 +76,7 @@ trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) lathist-objs := bpf_load.o lathist_user.o offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS) spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS) -map_perf_test-objs := bpf_load.o map_perf_test_user.o +map_perf_test-objs := map_perf_test_user.o test_overhead-objs := bpf_load.o test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o test_cgrp2_attach-objs := test_cgrp2_attach.o -- cgit v1.2.3 From 3677d0a13171bb1dc8db0af84d48dea14a899962 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 23 Aug 2020 17:53:33 +0900 Subject: samples: bpf: Refactor kprobe tracing programs with libbpf For the problem of increasing fragmentation of the bpf loader programs, instead of using bpf_loader.o, which is used in samples/bpf, this commit refactors the existing kprobe tracing programs with libbbpf bpf loader. - For kprobe events pointing to system calls, the SYSCALL() macro in trace_common.h was used. - Adding a kprobe event and attaching a bpf program to it was done through bpf_program_attach(). - Instead of using the existing BPF MAP definition, MAP definition has been refactored with the new BTF-defined MAP format. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200823085334.9413-3-danieltimlee@gmail.com --- samples/bpf/Makefile | 10 ++-- samples/bpf/lathist_kern.c | 24 ++++---- samples/bpf/lathist_user.c | 42 ++++++++++++-- samples/bpf/spintest_kern.c | 36 ++++++------ samples/bpf/spintest_user.c | 68 ++++++++++++++++++----- samples/bpf/test_current_task_under_cgroup_kern.c | 27 ++++----- samples/bpf/test_current_task_under_cgroup_user.c | 52 +++++++++++++---- samples/bpf/test_probe_write_user_kern.c | 12 ++-- samples/bpf/test_probe_write_user_user.c | 49 ++++++++++++---- samples/bpf/trace_output_kern.c | 15 ++--- samples/bpf/trace_output_user.c | 55 ++++++++++++------ 11 files changed, 272 insertions(+), 118 deletions(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0cac89230c6d..c74d477474e2 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -71,11 +71,11 @@ tracex4-objs := tracex4_user.o tracex5-objs := tracex5_user.o $(TRACE_HELPERS) tracex6-objs := tracex6_user.o tracex7-objs := tracex7_user.o -test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o -trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) -lathist-objs := bpf_load.o lathist_user.o +test_probe_write_user-objs := test_probe_write_user_user.o +trace_output-objs := trace_output_user.o $(TRACE_HELPERS) +lathist-objs := lathist_user.o offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS) -spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS) +spintest-objs := spintest_user.o $(TRACE_HELPERS) map_perf_test-objs := map_perf_test_user.o test_overhead-objs := bpf_load.o test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o @@ -86,7 +86,7 @@ xdp1-objs := xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := xdp1_user.o xdp_router_ipv4-objs := xdp_router_ipv4_user.o -test_current_task_under_cgroup-objs := bpf_load.o $(CGROUP_HELPERS) \ +test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := trace_event_user.o $(TRACE_HELPERS) sampleip-objs := sampleip_user.o $(TRACE_HELPERS) diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c index ca9c2e4e69aa..4adfcbbe6ef4 100644 --- a/samples/bpf/lathist_kern.c +++ b/samples/bpf/lathist_kern.c @@ -18,12 +18,12 @@ * trace_preempt_[on|off] tracepoints hooks is not supported. */ -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = MAX_CPU, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, u64); + __uint(max_entries, MAX_CPU); +} my_map SEC(".maps"); SEC("kprobe/trace_preempt_off") int bpf_prog1(struct pt_regs *ctx) @@ -61,12 +61,12 @@ static unsigned int log2l(unsigned long v) return log2(v); } -struct bpf_map_def SEC("maps") my_lat = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(long), - .max_entries = MAX_CPU * MAX_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, long); + __uint(max_entries, MAX_CPU * MAX_ENTRIES); +} my_lat SEC(".maps"); SEC("kprobe/trace_preempt_on") int bpf_prog2(struct pt_regs *ctx) diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c index 2ff2839a52d5..7d8ff2418303 100644 --- a/samples/bpf/lathist_user.c +++ b/samples/bpf/lathist_user.c @@ -6,9 +6,8 @@ #include #include #include -#include +#include #include -#include "bpf_load.h" #define MAX_ENTRIES 20 #define MAX_CPU 4 @@ -81,20 +80,51 @@ static void get_data(int fd) int main(int argc, char **argv) { + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int map_fd, i = 0; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } while (1) { - get_data(map_fd[1]); + get_data(map_fd); print_hist(); sleep(5); } +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c index f508af357251..455da77319d9 100644 --- a/samples/bpf/spintest_kern.c +++ b/samples/bpf/spintest_kern.c @@ -12,25 +12,25 @@ #include #include -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; -struct bpf_map_def SEC("maps") my_map2 = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_map2 SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define PROG(foo) \ int foo(struct pt_regs *ctx) \ diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index fb430ea2ef51..847da9284fa8 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -1,40 +1,77 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include #include #include -#include "bpf_load.h" +#include #include "trace_helpers.h" int main(int ac, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256], symbol[256]; + struct bpf_object *obj = NULL; + struct bpf_link *links[20]; long key, next_key, value; - char filename[256]; + struct bpf_program *prog; + int map_fd, i, j = 0; + const char *title; struct ksym *sym; - int i; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + title = bpf_program__title(prog, false); + if (sscanf(title, "kprobe/%s", symbol) != 1) + continue; + + /* Attach prog only when symbol exists */ + if (ksym_get_addr(symbol)) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } } for (i = 0; i < 5; i++) { key = 0; printf("kprobing funcs:"); - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); assert(next_key == value); sym = ksym_search(value); key = next_key; @@ -48,10 +85,15 @@ int main(int ac, char **argv) if (key) printf("\n"); key = 0; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) - bpf_map_delete_elem(map_fd[0], &next_key); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) + bpf_map_delete_elem(map_fd, &next_key); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c index 6dc4f41bb6cb..fbd43e2bb4d3 100644 --- a/samples/bpf/test_current_task_under_cgroup_kern.c +++ b/samples/bpf/test_current_task_under_cgroup_kern.c @@ -10,23 +10,24 @@ #include #include #include +#include "trace_common.h" -struct bpf_map_def SEC("maps") cgroup_map = { - .type = BPF_MAP_TYPE_CGROUP_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); -struct bpf_map_def SEC("maps") perf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 1); +} perf_map SEC(".maps"); /* Writes the last PID that called sync to a map at index 0 */ -SEC("kprobe/sys_sync") +SEC("kprobe/" SYSCALL(sys_sync)) int bpf_prog1(struct pt_regs *ctx) { u64 pid = bpf_get_current_pid_tgid(); diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c index 06e9f8ce42e2..ac251a417f45 100644 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ b/samples/bpf/test_current_task_under_cgroup_user.c @@ -4,10 +4,9 @@ #define _GNU_SOURCE #include -#include #include #include -#include "bpf_load.h" +#include #include "cgroup_helpers.h" #define CGROUP_PATH "/my-cgroup" @@ -15,13 +14,44 @@ int main(int argc, char **argv) { pid_t remote_pid, local_pid = getpid(); - int cg2, idx = 0, rc = 0; + struct bpf_link *link = NULL; + struct bpf_program *prog; + int cg2, idx = 0, rc = 1; + struct bpf_object *obj; char filename[256]; + int map_fd[2]; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "cgroup_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "perf_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } if (setup_cgroup_environment()) @@ -70,12 +100,14 @@ int main(int argc, char **argv) goto err; } - goto out; -err: - rc = 1; + rc = 0; -out: +err: close(cg2); cleanup_cgroup_environment(); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return rc; } diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c index fd651a65281e..220a96438d75 100644 --- a/samples/bpf/test_probe_write_user_kern.c +++ b/samples/bpf/test_probe_write_user_kern.c @@ -13,12 +13,12 @@ #include #include "trace_common.h" -struct bpf_map_def SEC("maps") dnat_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct sockaddr_in), - .value_size = sizeof(struct sockaddr_in), - .max_entries = 256, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct sockaddr_in); + __type(value, struct sockaddr_in); + __uint(max_entries, 256); +} dnat_map SEC(".maps"); /* kprobe is NOT a stable ABI * kernel functions can be removed, renamed or completely change semantics. diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c index 045eb5e30f54..00ccfb834e45 100644 --- a/samples/bpf/test_probe_write_user_user.c +++ b/samples/bpf/test_probe_write_user_user.c @@ -1,21 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include -#include "bpf_load.h" +#include #include -#include #include #include int main(int ac, char **argv) { - int serverfd, serverconnfd, clientfd; - socklen_t sockaddr_len; - struct sockaddr serv_addr, mapped_addr, tmp_addr; struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in; + struct sockaddr serv_addr, mapped_addr, tmp_addr; + int serverfd, serverconnfd, clientfd, map_fd; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + socklen_t sockaddr_len; char filename[256]; char *ip; @@ -24,10 +25,35 @@ int main(int ac, char **argv) tmp_addr_in = (struct sockaddr_in *)&tmp_addr; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "dnat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); @@ -51,7 +77,7 @@ int main(int ac, char **argv) mapped_addr_in->sin_port = htons(5555); mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255"); - assert(!bpf_map_update_elem(map_fd[0], &mapped_addr, &serv_addr, BPF_ANY)); + assert(!bpf_map_update_elem(map_fd, &mapped_addr, &serv_addr, BPF_ANY)); assert(listen(serverfd, 5) == 0); @@ -75,5 +101,8 @@ int main(int ac, char **argv) /* Is the server's getsockname = the socket getpeername */ assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c index 1d7d422cae6f..b64815af0943 100644 --- a/samples/bpf/trace_output_kern.c +++ b/samples/bpf/trace_output_kern.c @@ -2,15 +2,16 @@ #include #include #include +#include "trace_common.h" -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 2, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 2); +} my_map SEC(".maps"); -SEC("kprobe/sys_write") +SEC("kprobe/" SYSCALL(sys_write)) int bpf_prog1(struct pt_regs *ctx) { struct S { diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index 60a17dd05345..364b98764d54 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only #include -#include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include "bpf_load.h" -#include "perf-sys.h" static __u64 time_get_ns(void) { @@ -57,20 +44,48 @@ static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) int main(int argc, char **argv) { struct perf_buffer_opts pb_opts = {}; + struct bpf_link *link = NULL; + struct bpf_program *prog; struct perf_buffer *pb; + struct bpf_object *obj; + int map_fd, ret = 0; char filename[256]; FILE *f; - int ret; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (libbpf_get_error(prog)) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } pb_opts.sample_cb = print_bpf_output; - pb = perf_buffer__new(map_fd[0], 8, &pb_opts); + pb = perf_buffer__new(map_fd, 8, &pb_opts); ret = libbpf_get_error(pb); if (ret) { printf("failed to setup perf_buffer: %d\n", ret); @@ -84,5 +99,9 @@ int main(int argc, char **argv) while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { } kill(0, SIGINT); + +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return ret; } -- cgit v1.2.3 From f0c328f8af5d920a68f8217aec76d9a45288cef1 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 23 Aug 2020 17:53:34 +0900 Subject: samples: bpf: Refactor tracepoint tracing programs with libbpf For the problem of increasing fragmentation of the bpf loader programs, instead of using bpf_loader.o, which is used in samples/bpf, this commit refactors the existing tracepoint tracing programs with libbbpf bpf loader. - Adding a tracepoint event and attaching a bpf program to it was done through bpf_program_attach(). - Instead of using the existing BPF MAP definition, MAP definition has been refactored with the new BTF-defined MAP format. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200823085334.9413-4-danieltimlee@gmail.com --- samples/bpf/Makefile | 6 ++-- samples/bpf/cpustat_kern.c | 36 +++++++++++------------ samples/bpf/cpustat_user.c | 47 +++++++++++++++++++++++++----- samples/bpf/offwaketime_kern.c | 52 ++++++++++++++++----------------- samples/bpf/offwaketime_user.c | 66 ++++++++++++++++++++++++++++++++---------- samples/bpf/syscall_tp_kern.c | 24 +++++++-------- samples/bpf/syscall_tp_user.c | 54 ++++++++++++++++++++++++++-------- 7 files changed, 192 insertions(+), 93 deletions(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index c74d477474e2..a6d3646b3818 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -74,7 +74,7 @@ tracex7-objs := tracex7_user.o test_probe_write_user-objs := test_probe_write_user_user.o trace_output-objs := trace_output_user.o $(TRACE_HELPERS) lathist-objs := lathist_user.o -offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS) +offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) spintest-objs := spintest_user.o $(TRACE_HELPERS) map_perf_test-objs := map_perf_test_user.o test_overhead-objs := bpf_load.o test_overhead_user.o @@ -100,8 +100,8 @@ xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o -syscall_tp-objs := bpf_load.o syscall_tp_user.o -cpustat-objs := bpf_load.o cpustat_user.o +syscall_tp-objs := syscall_tp_user.o +cpustat-objs := cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c index a86a19d5f033..5aefd19cdfa1 100644 --- a/samples/bpf/cpustat_kern.c +++ b/samples/bpf/cpustat_kern.c @@ -51,28 +51,28 @@ static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; #define MAP_OFF_PSTATE_IDX 3 #define MAP_OFF_NUM 4 -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = MAX_CPU * MAP_OFF_NUM, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAP_OFF_NUM); +} my_map SEC(".maps"); /* cstate_duration records duration time for every idle state per CPU */ -struct bpf_map_def SEC("maps") cstate_duration = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); +} cstate_duration SEC(".maps"); /* pstate_duration records duration time for every operating point per CPU */ -struct bpf_map_def SEC("maps") pstate_duration = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); +} pstate_duration SEC(".maps"); /* * The trace events for cpu_idle and cpu_frequency are taken from: diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c index 869a99406dbf..96675985e9e0 100644 --- a/samples/bpf/cpustat_user.c +++ b/samples/bpf/cpustat_user.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -18,7 +17,9 @@ #include #include -#include "bpf_load.h" +#include + +static int cstate_map_fd, pstate_map_fd; #define MAX_CPU 8 #define MAX_PSTATE_ENTRIES 5 @@ -181,21 +182,50 @@ static void int_exit(int sig) { cpu_stat_inject_cpu_idle_event(); cpu_stat_inject_cpu_frequency_event(); - cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_update(cstate_map_fd, pstate_map_fd); cpu_stat_print(); exit(0); } int main(int argc, char **argv) { + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; int ret; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration"); + pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration"); + if (cstate_map_fd < 0 || pstate_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } ret = cpu_stat_inject_cpu_idle_event(); @@ -210,10 +240,13 @@ int main(int argc, char **argv) signal(SIGTERM, int_exit); while (1) { - cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_update(cstate_map_fd, pstate_map_fd); cpu_stat_print(); sleep(5); } +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index e74ee1cd4b9c..14b792915a9c 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -28,38 +28,38 @@ struct key_t { u32 tret; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; - -struct bpf_map_def SEC("maps") start = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 10000); +} counts SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 10000); +} start SEC(".maps"); struct wokeby_t { char name[TASK_COMM_LEN]; u32 ret; }; -struct bpf_map_def SEC("maps") wokeby = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u32), - .value_size = sizeof(struct wokeby_t), - .max_entries = 10000, -}; - -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, struct wokeby_t); + __uint(max_entries, 10000); +} wokeby SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index 51c7da5341cc..5734cfdaaacb 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -5,19 +5,19 @@ #include #include #include -#include -#include #include #include -#include #include #include #include -#include "bpf_load.h" +#include #include "trace_helpers.h" #define PRINT_RAW_ADDR 0 +/* counts, stackmap */ +static int map_fd[2]; + static void print_ksym(__u64 addr) { struct ksym *sym; @@ -52,14 +52,14 @@ static void print_stack(struct key_t *key, __u64 count) int i; printf("%s;", key->target); - if (bpf_map_lookup_elem(map_fd[3], &key->tret, ip) != 0) { + if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) { printf("---;"); } else { for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) print_ksym(ip[i]); } printf("-;"); - if (bpf_map_lookup_elem(map_fd[3], &key->wret, ip) != 0) { + if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) { printf("---;"); } else { for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) @@ -96,23 +96,54 @@ static void int_exit(int sig) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj = NULL; + struct bpf_link *links[2]; + struct bpf_program *prog; + int delay = 1, i = 0; char filename[256]; - int delay = 1; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - setrlimit(RLIMIT_MEMLOCK, &r); - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } if (load_kallsyms()) { printf("failed to process /proc/kallsyms\n"); return 2; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } if (argc > 1) @@ -120,5 +151,10 @@ int main(int argc, char **argv) sleep(delay); print_stacks(map_fd[0]); +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c index 5a62b03b1f88..50231c2eff9c 100644 --- a/samples/bpf/syscall_tp_kern.c +++ b/samples/bpf/syscall_tp_kern.c @@ -18,19 +18,19 @@ struct syscalls_exit_open_args { long ret; }; -struct bpf_map_def SEC("maps") enter_open_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} enter_open_map SEC(".maps"); -struct bpf_map_def SEC("maps") exit_open_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} exit_open_map SEC(".maps"); static __always_inline void count(void *map) { diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index 57014bab7cbe..76a1d00128fb 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -5,16 +5,12 @@ #include #include #include -#include -#include #include #include #include -#include -#include #include +#include #include -#include "bpf_load.h" /* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. @@ -49,16 +45,44 @@ static void verify_map(int map_id) static int test(char *filename, int num_progs) { - int i, fd, map0_fds[num_progs], map1_fds[num_progs]; + int map0_fds[num_progs], map1_fds[num_progs], fd, i, j = 0; + struct bpf_link *links[num_progs * 4]; + struct bpf_object *objs[num_progs]; + struct bpf_program *prog; for (i = 0; i < num_progs; i++) { - if (load_bpf_file(filename)) { - fprintf(stderr, "%s", bpf_log_buf); - return 1; + objs[i] = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(objs[i])) { + fprintf(stderr, "opening BPF object file failed\n"); + objs[i] = NULL; + goto cleanup; } - printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]); - map0_fds[i] = map_fd[0]; - map1_fds[i] = map_fd[1]; + + /* load BPF program */ + if (bpf_object__load(objs[i])) { + fprintf(stderr, "loading BPF object file failed\n"); + goto cleanup; + } + + map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "enter_open_map"); + map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i], + "exit_open_map"); + if (map0_fds[i] < 0 || map1_fds[i] < 0) { + fprintf(stderr, "finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, objs[i]) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]); } /* current load_bpf_file has perf_event_open default pid = -1 @@ -80,6 +104,12 @@ static int test(char *filename, int num_progs) verify_map(map1_fds[i]); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + for (i--; i >= 0; i--) + bpf_object__close(objs[i]); return 0; } -- cgit v1.2.3 From 35149b2c048e43562a598fd8ff91467d429bc666 Mon Sep 17 00:00:00 2001 From: Cristian Dumitrescu Date: Fri, 28 Aug 2020 10:26:28 +0200 Subject: samples/bpf: Add new sample xsk_fwd.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This sample code illustrates the packet forwarding between multiple AF_XDP sockets in multi-threading environment. All the threads and sockets are sharing a common buffer pool, with each socket having its own private buffer cache. The sockets are created with the xsk_socket__create_shared() function, which allows multiple AF_XDP sockets to share the same UMEM object. Example 1: Single thread handling two sockets. Packets received from socket A (on top of interface IFA, queue QA) are forwarded to socket B (on top of interface IFB, queue QB) and vice-versa. The thread is affinitized to CPU core C: ./xsk_fwd -i IFA -q QA -i IFB -q QB -c C Example 2: Two threads, each handling two sockets. Packets from socket A are sent to socket B (by thread X), packets from socket B are sent to socket A (by thread X); packets from socket C are sent to socket D (by thread Y), packets from socket D are sent to socket C (by thread Y). The two threads are bound to CPU cores CX and CY: ./xdp_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD -c CX -c CY Signed-off-by: Cristian Dumitrescu Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-15-git-send-email-magnus.karlsson@intel.com --- samples/bpf/Makefile | 3 + samples/bpf/xsk_fwd.c | 1085 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1088 insertions(+) create mode 100644 samples/bpf/xsk_fwd.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a6d3646b3818..4f1ed0e3cf9f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -48,6 +48,7 @@ tprogs-y += syscall_tp tprogs-y += cpustat tprogs-y += xdp_adjust_tail tprogs-y += xdpsock +tprogs-y += xsk_fwd tprogs-y += xdp_fwd tprogs-y += task_fd_query tprogs-y += xdp_sample_pkts @@ -104,6 +105,7 @@ syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o xdp_adjust_tail-objs := xdp_adjust_tail_user.o xdpsock-objs := xdpsock_user.o +xsk_fwd-objs := xsk_fwd.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) @@ -203,6 +205,7 @@ TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt TPROGLDLIBS_test_overhead += -lrt TPROGLDLIBS_xdpsock += -pthread +TPROGLDLIBS_xsk_fwd += -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c new file mode 100644 index 000000000000..1cd97c84c337 --- /dev/null +++ b/samples/bpf/xsk_fwd.c @@ -0,0 +1,1085 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +typedef __u64 u64; +typedef __u32 u32; +typedef __u16 u16; +typedef __u8 u8; + +/* This program illustrates the packet forwarding between multiple AF_XDP + * sockets in multi-threaded environment. All threads are sharing a common + * buffer pool, with each socket having its own private buffer cache. + * + * Example 1: Single thread handling two sockets. The packets received by socket + * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue + * QB), while the packets received by socket B are forwarded to socket A. The + * thread is running on CPU core X: + * + * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X + * + * Example 2: Two threads, each handling two sockets. The thread running on CPU + * core X forwards all the packets received by socket A to socket B, and all the + * packets received by socket B to socket A. The thread running on CPU core Y is + * performing the same packet forwarding between sockets C and D: + * + * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD + * -c CX -c CY + */ + +/* + * Buffer pool and buffer cache + * + * For packet forwarding, the packet buffers are typically allocated from the + * pool for packet reception and freed back to the pool for further reuse once + * the packet transmission is completed. + * + * The buffer pool is shared between multiple threads. In order to minimize the + * access latency to the shared buffer pool, each thread creates one (or + * several) buffer caches, which, unlike the buffer pool, are private to the + * thread that creates them and therefore cannot be shared with other threads. + * The access to the shared pool is only needed either (A) when the cache gets + * empty due to repeated buffer allocations and it needs to be replenished from + * the pool, or (B) when the cache gets full due to repeated buffer free and it + * needs to be flushed back to the pull. + * + * In a packet forwarding system, a packet received on any input port can + * potentially be transmitted on any output port, depending on the forwarding + * configuration. For AF_XDP sockets, for this to work with zero-copy of the + * packet buffers when, it is required that the buffer pool memory fits into the + * UMEM area shared by all the sockets. + */ + +struct bpool_params { + u32 n_buffers; + u32 buffer_size; + int mmap_flags; + + u32 n_users_max; + u32 n_buffers_per_slab; +}; + +/* This buffer pool implementation organizes the buffers into equally sized + * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the + * pool that are completely filled with buffer pointers (full slabs). + * + * Each buffer cache has a slab for buffer allocation and a slab for buffer + * free, with both of these slabs initially empty. When the cache's allocation + * slab goes empty, it is swapped with one of the available full slabs from the + * pool, if any is available. When the cache's free slab goes full, it is + * swapped for one of the empty slabs from the pool, which is guaranteed to + * succeed. + * + * Partially filled slabs never get traded between the cache and the pool + * (except when the cache itself is destroyed), which enables fast operation + * through pointer swapping. + */ +struct bpool { + struct bpool_params params; + pthread_mutex_t lock; + void *addr; + + u64 **slabs; + u64 **slabs_reserved; + u64 *buffers; + u64 *buffers_reserved; + + u64 n_slabs; + u64 n_slabs_reserved; + u64 n_buffers; + + u64 n_slabs_available; + u64 n_slabs_reserved_available; + + struct xsk_umem_config umem_cfg; + struct xsk_ring_prod umem_fq; + struct xsk_ring_cons umem_cq; + struct xsk_umem *umem; +}; + +static struct bpool * +bpool_init(struct bpool_params *params, + struct xsk_umem_config *umem_cfg) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved; + u64 slabs_size, slabs_reserved_size; + u64 buffers_size, buffers_reserved_size; + u64 total_size, i; + struct bpool *bp; + u8 *p; + int status; + + /* mmap prep. */ + if (setrlimit(RLIMIT_MEMLOCK, &r)) + return NULL; + + /* bpool internals dimensioning. */ + n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) / + params->n_buffers_per_slab; + n_slabs_reserved = params->n_users_max * 2; + n_buffers = n_slabs * params->n_buffers_per_slab; + n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab; + + slabs_size = n_slabs * sizeof(u64 *); + slabs_reserved_size = n_slabs_reserved * sizeof(u64 *); + buffers_size = n_buffers * sizeof(u64); + buffers_reserved_size = n_buffers_reserved * sizeof(u64); + + total_size = sizeof(struct bpool) + + slabs_size + slabs_reserved_size + + buffers_size + buffers_reserved_size; + + /* bpool memory allocation. */ + p = calloc(total_size, sizeof(u8)); + if (!p) + return NULL; + + /* bpool memory initialization. */ + bp = (struct bpool *)p; + memcpy(&bp->params, params, sizeof(*params)); + bp->params.n_buffers = n_buffers; + + bp->slabs = (u64 **)&p[sizeof(struct bpool)]; + bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) + + slabs_size]; + bp->buffers = (u64 *)&p[sizeof(struct bpool) + + slabs_size + slabs_reserved_size]; + bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) + + slabs_size + slabs_reserved_size + buffers_size]; + + bp->n_slabs = n_slabs; + bp->n_slabs_reserved = n_slabs_reserved; + bp->n_buffers = n_buffers; + + for (i = 0; i < n_slabs; i++) + bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab]; + bp->n_slabs_available = n_slabs; + + for (i = 0; i < n_slabs_reserved; i++) + bp->slabs_reserved[i] = &bp->buffers_reserved[i * + params->n_buffers_per_slab]; + bp->n_slabs_reserved_available = n_slabs_reserved; + + for (i = 0; i < n_buffers; i++) + bp->buffers[i] = i * params->buffer_size; + + /* lock. */ + status = pthread_mutex_init(&bp->lock, NULL); + if (status) { + free(p); + return NULL; + } + + /* mmap. */ + bp->addr = mmap(NULL, + n_buffers * params->buffer_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags, + -1, + 0); + if (bp->addr == MAP_FAILED) { + pthread_mutex_destroy(&bp->lock); + free(p); + return NULL; + } + + /* umem. */ + status = xsk_umem__create(&bp->umem, + bp->addr, + bp->params.n_buffers * bp->params.buffer_size, + &bp->umem_fq, + &bp->umem_cq, + umem_cfg); + if (status) { + munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); + pthread_mutex_destroy(&bp->lock); + free(p); + return NULL; + } + memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg)); + + return bp; +} + +static void +bpool_free(struct bpool *bp) +{ + if (!bp) + return; + + xsk_umem__delete(bp->umem); + munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); + pthread_mutex_destroy(&bp->lock); + free(bp); +} + +struct bcache { + struct bpool *bp; + + u64 *slab_cons; + u64 *slab_prod; + + u64 n_buffers_cons; + u64 n_buffers_prod; +}; + +static u32 +bcache_slab_size(struct bcache *bc) +{ + struct bpool *bp = bc->bp; + + return bp->params.n_buffers_per_slab; +} + +static struct bcache * +bcache_init(struct bpool *bp) +{ + struct bcache *bc; + + bc = calloc(1, sizeof(struct bcache)); + if (!bc) + return NULL; + + bc->bp = bp; + bc->n_buffers_cons = 0; + bc->n_buffers_prod = 0; + + pthread_mutex_lock(&bp->lock); + if (bp->n_slabs_reserved_available == 0) { + pthread_mutex_unlock(&bp->lock); + free(bc); + return NULL; + } + + bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1]; + bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2]; + bp->n_slabs_reserved_available -= 2; + pthread_mutex_unlock(&bp->lock); + + return bc; +} + +static void +bcache_free(struct bcache *bc) +{ + struct bpool *bp; + + if (!bc) + return; + + /* In order to keep this example simple, the case of freeing any + * existing buffers from the cache back to the pool is ignored. + */ + + bp = bc->bp; + pthread_mutex_lock(&bp->lock); + bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod; + bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons; + bp->n_slabs_reserved_available += 2; + pthread_mutex_unlock(&bp->lock); + + free(bc); +} + +/* To work correctly, the implementation requires that the *n_buffers* input + * argument is never greater than the buffer pool's *n_buffers_per_slab*. This + * is typically the case, with one exception taking place when large number of + * buffers are allocated at init time (e.g. for the UMEM fill queue setup). + */ +static inline u32 +bcache_cons_check(struct bcache *bc, u32 n_buffers) +{ + struct bpool *bp = bc->bp; + u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; + u64 n_buffers_cons = bc->n_buffers_cons; + u64 n_slabs_available; + u64 *slab_full; + + /* + * Consumer slab is not empty: Use what's available locally. Do not + * look for more buffers from the pool when the ask can only be + * partially satisfied. + */ + if (n_buffers_cons) + return (n_buffers_cons < n_buffers) ? + n_buffers_cons : + n_buffers; + + /* + * Consumer slab is empty: look to trade the current consumer slab + * (full) for a full slab from the pool, if any is available. + */ + pthread_mutex_lock(&bp->lock); + n_slabs_available = bp->n_slabs_available; + if (!n_slabs_available) { + pthread_mutex_unlock(&bp->lock); + return 0; + } + + n_slabs_available--; + slab_full = bp->slabs[n_slabs_available]; + bp->slabs[n_slabs_available] = bc->slab_cons; + bp->n_slabs_available = n_slabs_available; + pthread_mutex_unlock(&bp->lock); + + bc->slab_cons = slab_full; + bc->n_buffers_cons = n_buffers_per_slab; + return n_buffers; +} + +static inline u64 +bcache_cons(struct bcache *bc) +{ + u64 n_buffers_cons = bc->n_buffers_cons - 1; + u64 buffer; + + buffer = bc->slab_cons[n_buffers_cons]; + bc->n_buffers_cons = n_buffers_cons; + return buffer; +} + +static inline void +bcache_prod(struct bcache *bc, u64 buffer) +{ + struct bpool *bp = bc->bp; + u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; + u64 n_buffers_prod = bc->n_buffers_prod; + u64 n_slabs_available; + u64 *slab_empty; + + /* + * Producer slab is not yet full: store the current buffer to it. + */ + if (n_buffers_prod < n_buffers_per_slab) { + bc->slab_prod[n_buffers_prod] = buffer; + bc->n_buffers_prod = n_buffers_prod + 1; + return; + } + + /* + * Producer slab is full: trade the cache's current producer slab + * (full) for an empty slab from the pool, then store the current + * buffer to the new producer slab. As one full slab exists in the + * cache, it is guaranteed that there is at least one empty slab + * available in the pool. + */ + pthread_mutex_lock(&bp->lock); + n_slabs_available = bp->n_slabs_available; + slab_empty = bp->slabs[n_slabs_available]; + bp->slabs[n_slabs_available] = bc->slab_prod; + bp->n_slabs_available = n_slabs_available + 1; + pthread_mutex_unlock(&bp->lock); + + slab_empty[0] = buffer; + bc->slab_prod = slab_empty; + bc->n_buffers_prod = 1; +} + +/* + * Port + * + * Each of the forwarding ports sits on top of an AF_XDP socket. In order for + * packet forwarding to happen with no packet buffer copy, all the sockets need + * to share the same UMEM area, which is used as the buffer pool memory. + */ +#ifndef MAX_BURST_RX +#define MAX_BURST_RX 64 +#endif + +#ifndef MAX_BURST_TX +#define MAX_BURST_TX 64 +#endif + +struct burst_rx { + u64 addr[MAX_BURST_RX]; + u32 len[MAX_BURST_RX]; +}; + +struct burst_tx { + u64 addr[MAX_BURST_TX]; + u32 len[MAX_BURST_TX]; + u32 n_pkts; +}; + +struct port_params { + struct xsk_socket_config xsk_cfg; + struct bpool *bp; + const char *iface; + u32 iface_queue; +}; + +struct port { + struct port_params params; + + struct bcache *bc; + + struct xsk_ring_cons rxq; + struct xsk_ring_prod txq; + struct xsk_ring_prod umem_fq; + struct xsk_ring_cons umem_cq; + struct xsk_socket *xsk; + int umem_fq_initialized; + + u64 n_pkts_rx; + u64 n_pkts_tx; +}; + +static void +port_free(struct port *p) +{ + if (!p) + return; + + /* To keep this example simple, the code to free the buffers from the + * socket's receive and transmit queues, as well as from the UMEM fill + * and completion queues, is not included. + */ + + if (p->xsk) + xsk_socket__delete(p->xsk); + + bcache_free(p->bc); + + free(p); +} + +static struct port * +port_init(struct port_params *params) +{ + struct port *p; + u32 umem_fq_size, pos = 0; + int status, i; + + /* Memory allocation and initialization. */ + p = calloc(sizeof(struct port), 1); + if (!p) + return NULL; + + memcpy(&p->params, params, sizeof(p->params)); + umem_fq_size = params->bp->umem_cfg.fill_size; + + /* bcache. */ + p->bc = bcache_init(params->bp); + if (!p->bc || + (bcache_slab_size(p->bc) < umem_fq_size) || + (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) { + port_free(p); + return NULL; + } + + /* xsk socket. */ + status = xsk_socket__create_shared(&p->xsk, + params->iface, + params->iface_queue, + params->bp->umem, + &p->rxq, + &p->txq, + &p->umem_fq, + &p->umem_cq, + ¶ms->xsk_cfg); + if (status) { + port_free(p); + return NULL; + } + + /* umem fq. */ + xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos); + + for (i = 0; i < umem_fq_size; i++) + *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = + bcache_cons(p->bc); + + xsk_ring_prod__submit(&p->umem_fq, umem_fq_size); + p->umem_fq_initialized = 1; + + return p; +} + +static inline u32 +port_rx_burst(struct port *p, struct burst_rx *b) +{ + u32 n_pkts, pos, i; + + /* Free buffers for FQ replenish. */ + n_pkts = ARRAY_SIZE(b->addr); + + n_pkts = bcache_cons_check(p->bc, n_pkts); + if (!n_pkts) + return 0; + + /* RXQ. */ + n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos); + if (!n_pkts) { + if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { + struct pollfd pollfd = { + .fd = xsk_socket__fd(p->xsk), + .events = POLLIN, + }; + + poll(&pollfd, 1, 0); + } + return 0; + } + + for (i = 0; i < n_pkts; i++) { + b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr; + b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len; + } + + xsk_ring_cons__release(&p->rxq, n_pkts); + p->n_pkts_rx += n_pkts; + + /* UMEM FQ. */ + for ( ; ; ) { + int status; + + status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); + if (status == n_pkts) + break; + + if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { + struct pollfd pollfd = { + .fd = xsk_socket__fd(p->xsk), + .events = POLLIN, + }; + + poll(&pollfd, 1, 0); + } + } + + for (i = 0; i < n_pkts; i++) + *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = + bcache_cons(p->bc); + + xsk_ring_prod__submit(&p->umem_fq, n_pkts); + + return n_pkts; +} + +static inline void +port_tx_burst(struct port *p, struct burst_tx *b) +{ + u32 n_pkts, pos, i; + int status; + + /* UMEM CQ. */ + n_pkts = p->params.bp->umem_cfg.comp_size; + + n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos); + + for (i = 0; i < n_pkts; i++) { + u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i); + + bcache_prod(p->bc, addr); + } + + xsk_ring_cons__release(&p->umem_cq, n_pkts); + + /* TXQ. */ + n_pkts = b->n_pkts; + + for ( ; ; ) { + status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); + if (status == n_pkts) + break; + + if (xsk_ring_prod__needs_wakeup(&p->txq)) + sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, + NULL, 0); + } + + for (i = 0; i < n_pkts; i++) { + xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i]; + xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i]; + } + + xsk_ring_prod__submit(&p->txq, n_pkts); + if (xsk_ring_prod__needs_wakeup(&p->txq)) + sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + p->n_pkts_tx += n_pkts; +} + +/* + * Thread + * + * Packet forwarding threads. + */ +#ifndef MAX_PORTS_PER_THREAD +#define MAX_PORTS_PER_THREAD 16 +#endif + +struct thread_data { + struct port *ports_rx[MAX_PORTS_PER_THREAD]; + struct port *ports_tx[MAX_PORTS_PER_THREAD]; + u32 n_ports_rx; + struct burst_rx burst_rx; + struct burst_tx burst_tx[MAX_PORTS_PER_THREAD]; + u32 cpu_core_id; + int quit; +}; + +static void swap_mac_addresses(void *data) +{ + struct ether_header *eth = (struct ether_header *)data; + struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; + struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; + struct ether_addr tmp; + + tmp = *src_addr; + *src_addr = *dst_addr; + *dst_addr = tmp; +} + +static void * +thread_func(void *arg) +{ + struct thread_data *t = arg; + cpu_set_t cpu_cores; + u32 i; + + CPU_ZERO(&cpu_cores); + CPU_SET(t->cpu_core_id, &cpu_cores); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); + + for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { + struct port *port_rx = t->ports_rx[i]; + struct port *port_tx = t->ports_tx[i]; + struct burst_rx *brx = &t->burst_rx; + struct burst_tx *btx = &t->burst_tx[i]; + u32 n_pkts, j; + + /* RX. */ + n_pkts = port_rx_burst(port_rx, brx); + if (!n_pkts) + continue; + + /* Process & TX. */ + for (j = 0; j < n_pkts; j++) { + u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); + u8 *pkt = xsk_umem__get_data(port_rx->params.bp->addr, + addr); + + swap_mac_addresses(pkt); + + btx->addr[btx->n_pkts] = brx->addr[j]; + btx->len[btx->n_pkts] = brx->len[j]; + btx->n_pkts++; + + if (btx->n_pkts == MAX_BURST_TX) { + port_tx_burst(port_tx, btx); + btx->n_pkts = 0; + } + } + } + + return NULL; +} + +/* + * Process + */ +static const struct bpool_params bpool_params_default = { + .n_buffers = 64 * 1024, + .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .mmap_flags = 0, + + .n_users_max = 16, + .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, +}; + +static const struct xsk_umem_config umem_cfg_default = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + .flags = 0, +}; + +static const struct port_params port_params_default = { + .xsk_cfg = { + .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .libbpf_flags = 0, + .xdp_flags = XDP_FLAGS_DRV_MODE, + .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY, + }, + + .bp = NULL, + .iface = NULL, + .iface_queue = 0, +}; + +#ifndef MAX_PORTS +#define MAX_PORTS 64 +#endif + +#ifndef MAX_THREADS +#define MAX_THREADS 64 +#endif + +static struct bpool_params bpool_params; +static struct xsk_umem_config umem_cfg; +static struct bpool *bp; + +static struct port_params port_params[MAX_PORTS]; +static struct port *ports[MAX_PORTS]; +static u64 n_pkts_rx[MAX_PORTS]; +static u64 n_pkts_tx[MAX_PORTS]; +static int n_ports; + +static pthread_t threads[MAX_THREADS]; +static struct thread_data thread_data[MAX_THREADS]; +static int n_threads; + +static void +print_usage(char *prog_name) +{ + const char *usage = + "Usage:\n" + "\t%s [ -b SIZE ] -c CORE -i INTERFACE [ -q QUEUE ]\n" + "\n" + "-c CORE CPU core to run a packet forwarding thread\n" + " on. May be invoked multiple times.\n" + "\n" + "-b SIZE Number of buffers in the buffer pool shared\n" + " by all the forwarding threads. Default: %u.\n" + "\n" + "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n" + " pair specifies one forwarding port. May be\n" + " invoked multiple times.\n" + "\n" + "-q QUEUE Network interface queue for RX and TX. Each\n" + " (INTERFACE, QUEUE) pair specified one\n" + " forwarding port. Default: %u. May be invoked\n" + " multiple times.\n" + "\n"; + printf(usage, + prog_name, + bpool_params_default.n_buffers, + port_params_default.iface_queue); +} + +static int +parse_args(int argc, char **argv) +{ + struct option lgopts[] = { + { NULL, 0, 0, 0 } + }; + int opt, option_index; + + /* Parse the input arguments. */ + for ( ; ;) { + opt = getopt_long(argc, argv, "c:i:q:", lgopts, &option_index); + if (opt == EOF) + break; + + switch (opt) { + case 'b': + bpool_params.n_buffers = atoi(optarg); + break; + + case 'c': + if (n_threads == MAX_THREADS) { + printf("Max number of threads (%d) reached.\n", + MAX_THREADS); + return -1; + } + + thread_data[n_threads].cpu_core_id = atoi(optarg); + n_threads++; + break; + + case 'i': + if (n_ports == MAX_PORTS) { + printf("Max number of ports (%d) reached.\n", + MAX_PORTS); + return -1; + } + + port_params[n_ports].iface = optarg; + port_params[n_ports].iface_queue = 0; + n_ports++; + break; + + case 'q': + if (n_ports == 0) { + printf("No port specified for queue.\n"); + return -1; + } + port_params[n_ports - 1].iface_queue = atoi(optarg); + break; + + default: + printf("Illegal argument.\n"); + return -1; + } + } + + optind = 1; /* reset getopt lib */ + + /* Check the input arguments. */ + if (!n_ports) { + printf("No ports specified.\n"); + return -1; + } + + if (!n_threads) { + printf("No threads specified.\n"); + return -1; + } + + if (n_ports % n_threads) { + printf("Ports cannot be evenly distributed to threads.\n"); + return -1; + } + + return 0; +} + +static void +print_port(u32 port_id) +{ + struct port *port = ports[port_id]; + + printf("Port %u: interface = %s, queue = %u\n", + port_id, port->params.iface, port->params.iface_queue); +} + +static void +print_thread(u32 thread_id) +{ + struct thread_data *t = &thread_data[thread_id]; + u32 i; + + printf("Thread %u (CPU core %u): ", + thread_id, t->cpu_core_id); + + for (i = 0; i < t->n_ports_rx; i++) { + struct port *port_rx = t->ports_rx[i]; + struct port *port_tx = t->ports_tx[i]; + + printf("(%s, %u) -> (%s, %u), ", + port_rx->params.iface, + port_rx->params.iface_queue, + port_tx->params.iface, + port_tx->params.iface_queue); + } + + printf("\n"); +} + +static void +print_port_stats_separator(void) +{ + printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n", + "----", + "------------", + "-------------", + "------------", + "-------------"); +} + +static void +print_port_stats_header(void) +{ + print_port_stats_separator(); + printf("| %4s | %12s | %13s | %12s | %13s |\n", + "Port", + "RX packets", + "RX rate (pps)", + "TX packets", + "TX_rate (pps)"); + print_port_stats_separator(); +} + +static void +print_port_stats_trailer(void) +{ + print_port_stats_separator(); + printf("\n"); +} + +static void +print_port_stats(int port_id, u64 ns_diff) +{ + struct port *p = ports[port_id]; + double rx_pps, tx_pps; + + rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff; + tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. / ns_diff; + + printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n", + port_id, + p->n_pkts_rx, + rx_pps, + p->n_pkts_tx, + tx_pps); + + n_pkts_rx[port_id] = p->n_pkts_rx; + n_pkts_tx[port_id] = p->n_pkts_tx; +} + +static void +print_port_stats_all(u64 ns_diff) +{ + int i; + + print_port_stats_header(); + for (i = 0; i < n_ports; i++) + print_port_stats(i, ns_diff); + print_port_stats_trailer(); +} + +static int quit; + +static void +signal_handler(int sig) +{ + quit = 1; +} + +static void remove_xdp_program(void) +{ + int i; + + for (i = 0 ; i < n_ports; i++) + bpf_set_link_xdp_fd(if_nametoindex(port_params[i].iface), -1, + port_params[i].xsk_cfg.xdp_flags); +} + +int main(int argc, char **argv) +{ + struct timespec time; + u64 ns0; + int i; + + /* Parse args. */ + memcpy(&bpool_params, &bpool_params_default, + sizeof(struct bpool_params)); + memcpy(&umem_cfg, &umem_cfg_default, + sizeof(struct xsk_umem_config)); + for (i = 0; i < MAX_PORTS; i++) + memcpy(&port_params[i], &port_params_default, + sizeof(struct port_params)); + + if (parse_args(argc, argv)) { + print_usage(argv[0]); + return -1; + } + + /* Buffer pool initialization. */ + bp = bpool_init(&bpool_params, &umem_cfg); + if (!bp) { + printf("Buffer pool initialization failed.\n"); + return -1; + } + printf("Buffer pool created successfully.\n"); + + /* Ports initialization. */ + for (i = 0; i < MAX_PORTS; i++) + port_params[i].bp = bp; + + for (i = 0; i < n_ports; i++) { + ports[i] = port_init(&port_params[i]); + if (!ports[i]) { + printf("Port %d initialization failed.\n", i); + return -1; + } + print_port(i); + } + printf("All ports created successfully.\n"); + + /* Threads. */ + for (i = 0; i < n_threads; i++) { + struct thread_data *t = &thread_data[i]; + u32 n_ports_per_thread = n_ports / n_threads, j; + + for (j = 0; j < n_ports_per_thread; j++) { + t->ports_rx[j] = ports[i * n_ports_per_thread + j]; + t->ports_tx[j] = ports[i * n_ports_per_thread + + (j + 1) % n_ports_per_thread]; + } + + t->n_ports_rx = n_ports_per_thread; + + print_thread(i); + } + + for (i = 0; i < n_threads; i++) { + int status; + + status = pthread_create(&threads[i], + NULL, + thread_func, + &thread_data[i]); + if (status) { + printf("Thread %d creation failed.\n", i); + return -1; + } + } + printf("All threads created successfully.\n"); + + /* Print statistics. */ + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + signal(SIGABRT, signal_handler); + + clock_gettime(CLOCK_MONOTONIC, &time); + ns0 = time.tv_sec * 1000000000UL + time.tv_nsec; + for ( ; !quit; ) { + u64 ns1, ns_diff; + + sleep(1); + clock_gettime(CLOCK_MONOTONIC, &time); + ns1 = time.tv_sec * 1000000000UL + time.tv_nsec; + ns_diff = ns1 - ns0; + ns0 = ns1; + + print_port_stats_all(ns_diff); + } + + /* Threads completion. */ + printf("Quit.\n"); + for (i = 0; i < n_threads; i++) + thread_data[i].quit = 1; + + for (i = 0; i < n_threads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < n_ports; i++) + port_free(ports[i]); + + bpool_free(bp); + + remove_xdp_program(); + + return 0; +} -- cgit v1.2.3 From c8a039a47ffe06077929f29c9d4c7cba01a2104b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 14:51:05 +0200 Subject: samples/bpf: Optimize l2fwd performance in xdpsock Optimize the throughput performance of the l2fwd sub-app in the xdpsock sample application by removing a duplicate syscall and increasing the size of the fill ring. The latter needs some further explanation. We recommend that you set the fill ring size >= HW RX ring size + AF_XDP RX ring size. Make sure you fill up the fill ring with buffers at regular intervals, and you will with this setting avoid allocation failures in the driver. These are usually quite expensive since drivers have not been written to assume that allocation failures are common. For regular sockets, kernel allocated memory is used that only runs out in OOM situations that should be rare. These two performance optimizations together lead to a 6% percent improvement for the l2fwd app on my machine. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1598619065-1944-1-git-send-email-magnus.karlsson@intel.com --- samples/bpf/xdpsock_user.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 19c679456a0e..9f54df768d35 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -613,7 +613,16 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) { struct xsk_umem_info *umem; struct xsk_umem_config cfg = { - .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + /* We recommend that you set the fill ring size >= HW RX ring size + + * AF_XDP RX ring size. Make sure you fill up the fill ring + * with buffers at regular intervals, and you will with this setting + * avoid allocation failures in the driver. These are usually quite + * expensive since drivers have not been written to assume that + * allocation failures are common. For regular sockets, kernel + * allocated memory is used that only runs out in OOM situations + * that should be rare. + */ + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = opt_xsk_frame_size, .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, @@ -640,13 +649,13 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem) u32 idx; ret = xsk_ring_prod__reserve(&umem->fq, - XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); - if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx); + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2) exit_with_error(-ret); - for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++) *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * opt_xsk_frame_size; - xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); + xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2); } static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, @@ -888,9 +897,6 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); - ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; -- cgit v1.2.3 From b69e56cf765155dcac0037d7d0f162a2afab76c2 Mon Sep 17 00:00:00 2001 From: Weqaar Janjua Date: Sat, 29 Aug 2020 00:17:17 +0800 Subject: samples/bpf: Fix to xdpsock to avoid recycling frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The txpush program in the xdpsock sample application is supposed to send out all packets in the umem in a round-robin fashion. The problem is that it only cycled through the first BATCH_SIZE worth of packets. Fixed this so that it cycles through all buffers in the umem as intended. Fixes: 248c7f9c0e21 ("samples/bpf: convert xdpsock to use libbpf for AF_XDP access") Signed-off-by: Weqaar Janjua Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200828161717.42705-1-weqaar.a.janjua@intel.com --- samples/bpf/xdpsock_user.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 9f54df768d35..4cead341ae57 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1010,7 +1010,7 @@ static void rx_drop_all(void) } } -static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size) +static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) { u32 idx; unsigned int i; @@ -1023,14 +1023,14 @@ static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb, int batch_size) for (i = 0; i < batch_size; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); - tx_desc->addr = (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; + tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; tx_desc->len = PKT_SIZE; } xsk_ring_prod__submit(&xsk->tx, batch_size); xsk->outstanding_tx += batch_size; - frame_nb += batch_size; - frame_nb %= NUM_FRAMES; + *frame_nb += batch_size; + *frame_nb %= NUM_FRAMES; complete_tx_only(xsk, batch_size); } @@ -1086,7 +1086,7 @@ static void tx_only_all(void) } for (i = 0; i < num_socks; i++) - tx_only(xsks[i], frame_nb[i], batch_size); + tx_only(xsks[i], &frame_nb[i], batch_size); pkt_cnt += batch_size; -- cgit v1.2.3 From 698584dffd4bca834ac4733fc6a659a0a0d213e7 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 4 Sep 2020 15:34:33 +0900 Subject: samples, bpf: Replace bpf_program__title() with bpf_program__section_name() From commit 521095842027 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name""), the term title has been replaced with section name in libbpf. Since the bpf_program__title() has been deprecated, this commit switches this function to bpf_program__section_name(). Due to this commit, the compilation warning issue has also been resolved. Fixes: 521095842027 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name"") Signed-off-by: Daniel T. Lee Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904063434.24963-1-danieltimlee@gmail.com --- samples/bpf/sockex3_user.c | 6 +++--- samples/bpf/spintest_user.c | 6 +++--- samples/bpf/tracex5_user.c | 6 +++--- samples/bpf/xdp_redirect_cpu_user.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'samples') diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 4dbee7427d47..7793f6a6ae7e 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -29,8 +29,8 @@ int main(int argc, char **argv) struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_program *prog; struct bpf_object *obj; + const char *section; char filename[256]; - const char *title; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); @@ -58,8 +58,8 @@ int main(int argc, char **argv) bpf_object__for_each_program(prog, obj) { fd = bpf_program__fd(prog); - title = bpf_program__title(prog, false); - if (sscanf(title, "socket/%d", &key) != 1) { + section = bpf_program__section_name(prog); + if (sscanf(section, "socket/%d", &key) != 1) { fprintf(stderr, "ERROR: finding prog failed\n"); goto cleanup; } diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 847da9284fa8..f090d0dc60d6 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -17,7 +17,7 @@ int main(int ac, char **argv) long key, next_key, value; struct bpf_program *prog; int map_fd, i, j = 0; - const char *title; + const char *section; struct ksym *sym; if (setrlimit(RLIMIT_MEMLOCK, &r)) { @@ -51,8 +51,8 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - title = bpf_program__title(prog, false); - if (sscanf(title, "kprobe/%s", symbol) != 1) + section = bpf_program__section_name(prog); + if (sscanf(section, "kprobe/%s", symbol) != 1) continue; /* Attach prog only when symbol exists */ diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index 98dad57a96c4..c17d3fb5fd64 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -39,8 +39,8 @@ int main(int ac, char **argv) struct bpf_program *prog; struct bpf_object *obj; int key, fd, progs_fd; + const char *section; char filename[256]; - const char *title; FILE *f; setrlimit(RLIMIT_MEMLOCK, &r); @@ -78,9 +78,9 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - title = bpf_program__title(prog, false); + section = bpf_program__section_name(prog); /* register only syscalls to PROG_ARRAY */ - if (sscanf(title, "kprobe/%d", &key) != 1) + if (sscanf(section, "kprobe/%d", &key) != 1) continue; fd = bpf_program__fd(prog); diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 004c0622c913..3dd366e9474d 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -111,7 +111,7 @@ static void print_avail_progs(struct bpf_object *obj) bpf_object__for_each_program(pos, obj) { if (bpf_program__is_xdp(pos)) - printf(" %s\n", bpf_program__title(pos, false)); + printf(" %s\n", bpf_program__section_name(pos)); } } -- cgit v1.2.3 From f9bec5d756b30d5b21aa5ff9b7d5d115741517c1 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Fri, 4 Sep 2020 15:34:34 +0900 Subject: samples, bpf: Add xsk_fwd test file to .gitignore This commit adds xsk_fwd test file to .gitignore which is newly added to samples/bpf. Signed-off-by: Daniel T. Lee Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904063434.24963-2-danieltimlee@gmail.com --- samples/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'samples') diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 034800c4d1e6..b2f29bc8dc43 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -50,4 +50,5 @@ xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +xsk_fwd testfile.img -- cgit v1.2.3 From 3131cf66d3033aa40db5b5d72a2673315b61c862 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 10 Sep 2020 10:31:04 +0200 Subject: samples/bpf: Fix one packet sending in xdpsock Fix the sending of a single packet (or small burst) in xdpsock when executing in copy mode. Currently, the l2fwd application in xdpsock only transmits the packets after a batch of them has been received, which might be confusing if you only send one packet and expect that it is returned pronto. Fix this by calling sendto() more often and add a comment in the code that states that this can be optimized if needed. Reported-by: Tirthendu Sarkar Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1599726666-8431-2-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4cead341ae57..b6175cb9a31d 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -897,6 +897,14 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; + /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to + * really send the packets. In zero-copy mode we do not have to do this, since Tx + * is driven by the NAPI loop. So as an optimization, we do not have to call + * sendto() all the time in zero-copy mode for l2fwd. + */ + if (opt_xdp_bind_flags & XDP_COPY) + kick_tx(xsk); + ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; -- cgit v1.2.3 From 5a2a0dd88f0f267ac5953acd81050ae43a82201f Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 10 Sep 2020 10:31:05 +0200 Subject: samples/bpf: Fix possible deadlock in xdpsock Fix a possible deadlock in the l2fwd application in xdpsock that can occur when there is no space in the Tx ring. There are two ways to get the kernel to consume entries in the Tx ring: calling sendto() to make it send packets and freeing entries from the completion ring, as the kernel will not send a packet if there is no space for it to add a completion entry in the completion ring. The Tx loop in l2fwd only used to call sendto(). This patches adds cleaning the completion ring in that loop. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1599726666-8431-3-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b6175cb9a31d..b60bf4ef7cdb 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1125,6 +1125,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + complete_tx_l2fwd(xsk, fds); if (xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); -- cgit v1.2.3 From 74e00676d7f1f3c0676fdfffcf404a09a037bfe3 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 10 Sep 2020 10:31:06 +0200 Subject: samples/bpf: Add quiet option to xdpsock Add a quiet option (-Q) that disables the statistics print outs of xdpsock. This is good to have when measuring 0% loss rate performance as it will be quite terrible if the application uses printfs. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1599726666-8431-4-git-send-email-magnus.karlsson@gmail.com --- samples/bpf/xdpsock_user.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b60bf4ef7cdb..b220173dbe1e 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -78,6 +78,7 @@ static int opt_pkt_count; static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; +static bool opt_quiet; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -718,6 +719,7 @@ static struct option long_options[] = { {"tx-pkt-size", required_argument, 0, 's'}, {"tx-pkt-pattern", required_argument, 0, 'P'}, {"extra-stats", no_argument, 0, 'x'}, + {"quiet", no_argument, 0, 'Q'}, {0, 0, 0, 0} }; @@ -753,6 +755,7 @@ static void usage(const char *prog) " Min size: %d, Max size %d.\n" " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" " -x, --extra-stats Display extra statistics.\n" + " -Q, --quiet Do not display any stats.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -768,7 +771,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:x", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", long_options, &option_index); if (c == -1) break; @@ -852,6 +855,9 @@ static void parse_command_line(int argc, char **argv) case 'x': opt_extra_stats = 1; break; + case 'Q': + opt_quiet = 1; + break; default: usage(basename(argv[0])); } @@ -1286,9 +1292,11 @@ int main(int argc, char **argv) setlocale(LC_ALL, ""); - ret = pthread_create(&pt, NULL, poller, NULL); - if (ret) - exit_with_error(ret); + if (!opt_quiet) { + ret = pthread_create(&pt, NULL, poller, NULL); + if (ret) + exit_with_error(ret); + } prev_time = get_nsecs(); start_time = prev_time; @@ -1302,7 +1310,8 @@ int main(int argc, char **argv) benchmark_done = true; - pthread_join(pt, NULL); + if (!opt_quiet) + pthread_join(pt, NULL); xdpsock_cleanup(); -- cgit v1.2.3 From f55f4c349a03d820c27145bdf457013b42e4b487 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 15 Sep 2020 13:55:19 +0200 Subject: samples/bpf: Fix test_map_in_map on s390 s390 uses socketcall multiplexer instead of individual socket syscalls. Therefore, "kprobe/" SYSCALL(sys_connect) does not trigger and test_map_in_map fails. Fix by using "kprobe/__sys_connect" instead. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200915115519.3769807-1-iii@linux.ibm.com --- samples/bpf/test_map_in_map_kern.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'samples') diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c index 8def45c5b697..b0200c8eac09 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map_kern.c @@ -103,10 +103,9 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) return result ? *result : -ENOENT; } -SEC("kprobe/" SYSCALL(sys_connect)) +SEC("kprobe/__sys_connect") int trace_sys_connect(struct pt_regs *ctx) { - struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); struct sockaddr_in6 *in6; u16 test_case, port, dst6[8]; int addrlen, ret, inline_ret, ret_key = 0; @@ -114,8 +113,8 @@ int trace_sys_connect(struct pt_regs *ctx) void *outer_map, *inner_map; bool inline_hash = false; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); - addrlen = (int)PT_REGS_PARM3_CORE(real_regs); + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx); + addrlen = (int)PT_REGS_PARM3_CORE(ctx); if (addrlen != sizeof(*in6)) return 0; -- cgit v1.2.3 From faef26fa444dc44eeff70c9a63ebe7fef00b6c37 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:19 +0200 Subject: bpf, selftests: Use bpf_tail_call_static where appropriate For those locations where we use an immediate tail call map index use the newly added bpf_tail_call_static() helper. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/3cfb2b799a62d22c6e7ae5897c23940bdcc24cbc.1601477936.git.daniel@iogearbox.net --- samples/bpf/sockex3_kern.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'samples') diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index cab9cca0b8eb..8142d02b33e6 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -31,28 +31,30 @@ struct { #define PARSE_IP 3 #define PARSE_IPV6 4 -/* protocol dispatch routine. - * It tail-calls next BPF program depending on eth proto - * Note, we could have used: - * bpf_tail_call(skb, &jmp_table, proto); - * but it would need large prog_array +/* Protocol dispatch routine. It tail-calls next BPF program depending + * on eth proto. Note, we could have used ... + * + * bpf_tail_call(skb, &jmp_table, proto); + * + * ... but it would need large prog_array and cannot be optimised given + * the map key is not static. */ static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) { switch (proto) { case ETH_P_8021Q: case ETH_P_8021AD: - bpf_tail_call(skb, &jmp_table, PARSE_VLAN); + bpf_tail_call_static(skb, &jmp_table, PARSE_VLAN); break; case ETH_P_MPLS_UC: case ETH_P_MPLS_MC: - bpf_tail_call(skb, &jmp_table, PARSE_MPLS); + bpf_tail_call_static(skb, &jmp_table, PARSE_MPLS); break; case ETH_P_IP: - bpf_tail_call(skb, &jmp_table, PARSE_IP); + bpf_tail_call_static(skb, &jmp_table, PARSE_IP); break; case ETH_P_IPV6: - bpf_tail_call(skb, &jmp_table, PARSE_IPV6); + bpf_tail_call_static(skb, &jmp_table, PARSE_IPV6); break; } } -- cgit v1.2.3 From 9618bde489b2d23779b1e2d04c10983da21f3818 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 5 Oct 2020 21:34:26 -0700 Subject: samples/bpf: Change Makefile to cope with latest llvm With latest llvm trunk, bpf programs under samples/bpf directory, if using CORE, may experience the following errors: LLVM ERROR: Cannot select: intrinsic %llvm.preserve.struct.access.index PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace. Stack dump: 0. Program arguments: llc -march=bpf -filetype=obj -o samples/bpf/test_probe_write_user_kern.o 1. Running pass 'Function Pass Manager' on module ''. 2. Running pass 'BPF DAG->DAG Pattern Instruction Selection' on function '@bpf_prog1' #0 0x000000000183c26c llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x183c26c) ... #7 0x00000000017c375e (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x17c375e) #8 0x00000000016a75c5 llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x16a75c5) #9 0x00000000016ab4f8 llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/data/users/yhs/work/llvm-project/llvm/build.cur/install/bin/llc+0x16ab4f8) ... Aborted (core dumped) | llc -march=bpf -filetype=obj -o samples/bpf/test_probe_write_user_kern.o The reason is due to llvm change https://reviews.llvm.org/D87153 where the CORE relocation global generation is moved from the beginning of target dependent optimization (llc) to the beginning of target independent optimization (opt). Since samples/bpf programs did not use vmlinux.h and its clang compilation uses native architecture, we need to adjust arch triple at opt level to do CORE relocation global generation properly. Otherwise, the above error will appear. This patch fixed the issue by introduce opt and llvm-dis to compilation chain, which will do proper CORE relocation global generation as well as O2 level optimization. Tested with llvm10, llvm11 and trunk/llvm12. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201006043427.1891742-1-yhs@fb.com --- samples/bpf/Makefile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4f1ed0e3cf9f..e29b1e89dd3e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -211,6 +211,8 @@ TPROGLDLIBS_xsk_fwd += -pthread # make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang LLC ?= llc CLANG ?= clang +OPT ?= opt +LLVM_DIS ?= llvm-dis LLVM_OBJCOPY ?= llvm-objcopy BTF_PAHOLE ?= pahole @@ -303,6 +305,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. +# below we use long chain of commands, clang | opt | llvm-dis | llc, +# to generate final object file. 'clang' compiles the source into IR +# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin +# processing (llvm12) and IR optimizations. 'llvm-dis' converts +# 'opt' output to IR, and finally 'llc' generates bpf byte code. $(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ @@ -314,7 +321,9 @@ $(obj)/%.o: $(src)/%.c -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ - -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ + -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ + $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ + $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif -- cgit v1.2.3 From 544d6adf3c3dddbf80e5757a3098c63612f5f8f8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 5 Oct 2020 21:34:27 -0700 Subject: samples/bpf: Fix a compilation error with fallthrough marking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling samples/bpf hits an error related to fallthrough marking. ... CC samples/bpf/hbm.o samples/bpf/hbm.c: In function ‘main’: samples/bpf/hbm.c:486:4: error: ‘fallthrough’ undeclared (first use in this function) fallthrough; ^~~~~~~~~~~ The "fallthrough" is not defined under tools/include directory. Rather, it is "__fallthrough" is defined in linux/compiler.h. Including "linux/compiler.h" and using "__fallthrough" fixed the issue. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201006043427.1891805-1-yhs@fb.com --- samples/bpf/hbm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index 4b22ace52f80..ff4c533dfac2 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -483,7 +484,7 @@ int main(int argc, char **argv) "Option -%c requires an argument.\n\n", optopt); case 'h': - fallthrough; + __fallthrough; default: Usage(); return 0; -- cgit v1.2.3 From 2e8806f032f5d9d5c59b790c73b83842bda78f23 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Fri, 2 Oct 2020 13:36:10 +0000 Subject: samples: bpf: Split xdpsock stats into new struct New statistics will be added in future commits. In preparation for this, let's split out the existing statistics into their own struct. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201002133612.31536-1-ciara.loftus@intel.com --- samples/bpf/xdpsock_user.c | 123 +++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 54 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b220173dbe1e..4c5022e6479e 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -91,18 +91,7 @@ static bool opt_need_wakeup = true; static u32 opt_num_xsks = 1; static u32 prog_id; -struct xsk_umem_info { - struct xsk_ring_prod fq; - struct xsk_ring_cons cq; - struct xsk_umem *umem; - void *buffer; -}; - -struct xsk_socket_info { - struct xsk_ring_cons rx; - struct xsk_ring_prod tx; - struct xsk_umem_info *umem; - struct xsk_socket *xsk; +struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; unsigned long rx_dropped_npkts; @@ -119,6 +108,21 @@ struct xsk_socket_info { unsigned long prev_rx_full_npkts; unsigned long prev_rx_fill_empty_npkts; unsigned long prev_tx_empty_npkts; +}; + +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; +}; + +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; + struct xsk_ring_stats ring_stats; u32 outstanding_tx; }; @@ -173,12 +177,12 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) return err; if (optlen == sizeof(struct xdp_statistics)) { - xsk->rx_dropped_npkts = stats.rx_dropped; - xsk->rx_invalid_npkts = stats.rx_invalid_descs; - xsk->tx_invalid_npkts = stats.tx_invalid_descs; - xsk->rx_full_npkts = stats.rx_ring_full; - xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; - xsk->tx_empty_npkts = stats.tx_ring_empty_descs; + xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; + xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; + xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; + xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; + xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; + xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; return 0; } @@ -198,9 +202,9 @@ static void dump_stats(void) double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, tx_invalid_pps, tx_empty_pps; - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * 1000000000. / dt; - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * 1000000000. / dt; printf("\n sock%d@", i); @@ -209,47 +213,58 @@ static void dump_stats(void) printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); + printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; + xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; + xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; if (opt_extra_stats) { if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { - dropped_pps = (xsks[i]->rx_dropped_npkts - - xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt; - rx_invalid_pps = (xsks[i]->rx_invalid_npkts - - xsks[i]->prev_rx_invalid_npkts) * 1000000000. / dt; - tx_invalid_pps = (xsks[i]->tx_invalid_npkts - - xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt; - full_pps = (xsks[i]->rx_full_npkts - - xsks[i]->prev_rx_full_npkts) * 1000000000. / dt; - fill_empty_pps = (xsks[i]->rx_fill_empty_npkts - - xsks[i]->prev_rx_fill_empty_npkts) - * 1000000000. / dt; - tx_empty_pps = (xsks[i]->tx_empty_npkts - - xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt; + dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - + xsks[i]->ring_stats.prev_rx_dropped_npkts) * + 1000000000. / dt; + rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - + xsks[i]->ring_stats.prev_rx_invalid_npkts) * + 1000000000. / dt; + tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - + xsks[i]->ring_stats.prev_tx_invalid_npkts) * + 1000000000. / dt; + full_pps = (xsks[i]->ring_stats.rx_full_npkts - + xsks[i]->ring_stats.prev_rx_full_npkts) * + 1000000000. / dt; + fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - + xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * + 1000000000. / dt; + tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - + xsks[i]->ring_stats.prev_tx_empty_npkts) * + 1000000000. / dt; printf(fmt, "rx dropped", dropped_pps, - xsks[i]->rx_dropped_npkts); + xsks[i]->ring_stats.rx_dropped_npkts); printf(fmt, "rx invalid", rx_invalid_pps, - xsks[i]->rx_invalid_npkts); + xsks[i]->ring_stats.rx_invalid_npkts); printf(fmt, "tx invalid", tx_invalid_pps, - xsks[i]->tx_invalid_npkts); + xsks[i]->ring_stats.tx_invalid_npkts); printf(fmt, "rx queue full", full_pps, - xsks[i]->rx_full_npkts); + xsks[i]->ring_stats.rx_full_npkts); printf(fmt, "fill ring empty", fill_empty_pps, - xsks[i]->rx_fill_empty_npkts); + xsks[i]->ring_stats.rx_fill_empty_npkts); printf(fmt, "tx ring empty", tx_empty_pps, - xsks[i]->tx_empty_npkts); - - xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts; - xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts; - xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts; - xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts; - xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts; - xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts; + xsks[i]->ring_stats.tx_empty_npkts); + + xsks[i]->ring_stats.prev_rx_dropped_npkts = + xsks[i]->ring_stats.rx_dropped_npkts; + xsks[i]->ring_stats.prev_rx_invalid_npkts = + xsks[i]->ring_stats.rx_invalid_npkts; + xsks[i]->ring_stats.prev_tx_invalid_npkts = + xsks[i]->ring_stats.tx_invalid_npkts; + xsks[i]->ring_stats.prev_rx_full_npkts = + xsks[i]->ring_stats.rx_full_npkts; + xsks[i]->ring_stats.prev_rx_fill_empty_npkts = + xsks[i]->ring_stats.rx_fill_empty_npkts; + xsks[i]->ring_stats.prev_tx_empty_npkts = + xsks[i]->ring_stats.tx_empty_npkts; } else { printf("%-15s\n", "Error retrieving extra stats"); } @@ -936,7 +951,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; } } @@ -956,7 +971,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; } } @@ -996,7 +1011,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; + xsk->ring_stats.rx_npkts += rcvd; } static void rx_drop_all(void) @@ -1155,7 +1170,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->tx, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; + xsk->ring_stats.rx_npkts += rcvd; xsk->outstanding_tx += rcvd; } -- cgit v1.2.3 From 60dc609dbd545226e00c9e7b518d0f4873f0a3e1 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Fri, 2 Oct 2020 13:36:11 +0000 Subject: samples: bpf: Count syscalls in xdpsock Categorise and record syscalls issued in the xdpsock sample app. The categories recorded are: rx_empty_polls: polls when the rx ring is empty fill_fail_polls: polls when failed to get addr from fill ring copy_tx_sendtos: sendtos issued for tx when copy mode enabled tx_wakeup_sendtos: sendtos issued when tx ring needs waking up opt_polls: polls issued since the '-p' flag is set Print the stats using '-a' on the xdpsock command line. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201002133612.31536-2-ciara.loftus@intel.com --- samples/bpf/xdpsock_user.c | 113 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 10 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4c5022e6479e..ff119ede4ab1 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -79,6 +79,7 @@ static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; static bool opt_quiet; +static bool opt_app_stats; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -110,6 +111,19 @@ struct xsk_ring_stats { unsigned long prev_tx_empty_npkts; }; +struct xsk_app_stats { + unsigned long rx_empty_polls; + unsigned long fill_fail_polls; + unsigned long copy_tx_sendtos; + unsigned long tx_wakeup_sendtos; + unsigned long opt_polls; + unsigned long prev_rx_empty_polls; + unsigned long prev_fill_fail_polls; + unsigned long prev_copy_tx_sendtos; + unsigned long prev_tx_wakeup_sendtos; + unsigned long prev_opt_polls; +}; + struct xsk_umem_info { struct xsk_ring_prod fq; struct xsk_ring_cons cq; @@ -123,6 +137,7 @@ struct xsk_socket_info { struct xsk_umem_info *umem; struct xsk_socket *xsk; struct xsk_ring_stats ring_stats; + struct xsk_app_stats app_stats; u32 outstanding_tx; }; @@ -189,6 +204,45 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) return -EINVAL; } +static void dump_app_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, + tx_wakeup_sendtos_ps, opt_polls_ps; + + rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - + xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; + fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - + xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; + copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - + xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; + tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - + xsks[i]->app_stats.prev_tx_wakeup_sendtos) + * 1000000000. / dt; + opt_polls_ps = (xsks[i]->app_stats.opt_polls - + xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt; + + printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); + printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); + printf(fmt, "fill fail polls", fill_fail_polls_ps, + xsks[i]->app_stats.fill_fail_polls); + printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, + xsks[i]->app_stats.copy_tx_sendtos); + printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, + xsks[i]->app_stats.tx_wakeup_sendtos); + printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); + + xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; + xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; + xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; + xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; + xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; + } +} + static void dump_stats(void) { unsigned long now = get_nsecs(); @@ -198,7 +252,7 @@ static void dump_stats(void) prev_time = now; for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-15s %'-11.0f %'-11lu\n"; + char *fmt = "%-18s %'-14.0f %'-14lu\n"; double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, tx_invalid_pps, tx_empty_pps; @@ -211,7 +265,7 @@ static void dump_stats(void) print_benchmark(false); printf("\n"); - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", dt / 1000000000.); printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); @@ -270,6 +324,9 @@ static void dump_stats(void) } } } + + if (opt_app_stats) + dump_app_stats(dt); } static bool is_benchmark_done(void) @@ -708,6 +765,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, if (ret) exit_with_error(-ret); + xsk->app_stats.rx_empty_polls = 0; + xsk->app_stats.fill_fail_polls = 0; + xsk->app_stats.copy_tx_sendtos = 0; + xsk->app_stats.tx_wakeup_sendtos = 0; + xsk->app_stats.opt_polls = 0; + xsk->app_stats.prev_rx_empty_polls = 0; + xsk->app_stats.prev_fill_fail_polls = 0; + xsk->app_stats.prev_copy_tx_sendtos = 0; + xsk->app_stats.prev_tx_wakeup_sendtos = 0; + xsk->app_stats.prev_opt_polls = 0; + return xsk; } @@ -735,6 +803,7 @@ static struct option long_options[] = { {"tx-pkt-pattern", required_argument, 0, 'P'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, + {"app-stats", no_argument, 0, 'a'}, {0, 0, 0, 0} }; @@ -771,6 +840,7 @@ static void usage(const char *prog) " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" + " -a, --app-stats Display application (syscall) statistics.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -786,7 +856,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQa", long_options, &option_index); if (c == -1) break; @@ -873,6 +943,9 @@ static void parse_command_line(int argc, char **argv) case 'Q': opt_quiet = 1; break; + case 'a': + opt_app_stats = 1; + break; default: usage(basename(argv[0])); } @@ -923,8 +996,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, * is driven by the NAPI loop. So as an optimization, we do not have to call * sendto() all the time in zero-copy mode for l2fwd. */ - if (opt_xdp_bind_flags & XDP_COPY) + if (opt_xdp_bind_flags & XDP_COPY) { + xsk->app_stats.copy_tx_sendtos++; kick_tx(xsk); + } ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : xsk->outstanding_tx; @@ -939,8 +1014,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&umem->fq)) + if (xsk_ring_prod__needs_wakeup(&umem->fq)) { + xsk->app_stats.fill_fail_polls++; ret = poll(fds, num_socks, opt_timeout); + } ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); } @@ -964,8 +1041,10 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; - if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); + } rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); if (rcvd > 0) { @@ -983,8 +1062,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; ret = poll(fds, num_socks, opt_timeout); + } return; } @@ -992,8 +1073,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.fill_fail_polls++; ret = poll(fds, num_socks, opt_timeout); + } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } @@ -1026,6 +1109,8 @@ static void rx_drop_all(void) for (;;) { if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -1106,6 +1191,8 @@ static void tx_only_all(void) int batch_size = get_batch_size(pkt_cnt); if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -1137,8 +1224,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + xsk->app_stats.rx_empty_polls++; ret = poll(fds, num_socks, opt_timeout); + } return; } @@ -1147,8 +1236,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) if (ret < 0) exit_with_error(-ret); complete_tx_l2fwd(xsk, fds); - if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + xsk->app_stats.tx_wakeup_sendtos++; kick_tx(xsk); + } ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); } @@ -1186,6 +1277,8 @@ static void l2fwd_all(void) for (;;) { if (opt_poll) { + for (i = 0; i < num_socks; i++) + xsks[i]->app_stats.opt_polls++; ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; -- cgit v1.2.3 From 67ed375530e223624508382a9348a21c3b2ccc09 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Fri, 2 Oct 2020 13:36:12 +0000 Subject: samples: bpf: Driver interrupt statistics in xdpsock Add an option to count the number of interrupts generated per second and total number of interrupts during the lifetime of the application for a given interface. This information is extracted from /proc/interrupts. Since there is no naming convention across drivers, the user must provide the string which is specific to their interface in the /proc/interrupts file on the command line. Usage: ./xdpsock ... -I eg. for queue 0 of i40e device eth0: ./xdpsock ... -I i40e-eth0-TxRx-0 Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201002133612.31536-3-ciara.loftus@intel.com --- samples/bpf/xdpsock_user.c | 120 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index ff119ede4ab1..1149e94ca32f 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,9 @@ static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; static bool opt_quiet; static bool opt_app_stats; +static const char *opt_irq_str = ""; +static u32 irq_no; +static int irqs_at_init = -1; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -111,6 +115,11 @@ struct xsk_ring_stats { unsigned long prev_tx_empty_npkts; }; +struct xsk_driver_stats { + unsigned long intrs; + unsigned long prev_intrs; +}; + struct xsk_app_stats { unsigned long rx_empty_polls; unsigned long fill_fail_polls; @@ -138,6 +147,7 @@ struct xsk_socket_info { struct xsk_socket *xsk; struct xsk_ring_stats ring_stats; struct xsk_app_stats app_stats; + struct xsk_driver_stats drv_stats; u32 outstanding_tx; }; @@ -243,6 +253,100 @@ static void dump_app_stats(long dt) } } +static bool get_interrupt_number(void) +{ + FILE *f_int_proc; + char line[4096]; + bool found = false; + + f_int_proc = fopen("/proc/interrupts", "r"); + if (f_int_proc == NULL) { + printf("Failed to open /proc/interrupts.\n"); + return found; + } + + while (!feof(f_int_proc) && !found) { + /* Make sure to read a full line at a time */ + if (fgets(line, sizeof(line), f_int_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from interrupts file\n"); + break; + } + + /* Extract interrupt number from line */ + if (strstr(line, opt_irq_str) != NULL) { + irq_no = atoi(line); + found = true; + break; + } + } + + fclose(f_int_proc); + + return found; +} + +static int get_irqs(void) +{ + char count_path[PATH_MAX]; + int total_intrs = -1; + FILE *f_count_proc; + char line[4096]; + + snprintf(count_path, sizeof(count_path), + "/sys/kernel/irq/%i/per_cpu_count", irq_no); + f_count_proc = fopen(count_path, "r"); + if (f_count_proc == NULL) { + printf("Failed to open %s\n", count_path); + return total_intrs; + } + + if (fgets(line, sizeof(line), f_count_proc) == NULL || + line[strlen(line) - 1] != '\n') { + printf("Error reading from %s\n", count_path); + } else { + static const char com[2] = ","; + char *token; + + total_intrs = 0; + token = strtok(line, com); + while (token != NULL) { + /* sum up interrupts across all cores */ + total_intrs += atoi(token); + token = strtok(NULL, com); + } + } + + fclose(f_count_proc); + + return total_intrs; +} + +static void dump_driver_stats(long dt) +{ + int i; + + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-18s %'-14.0f %'-14lu\n"; + double intrs_ps; + int n_ints = get_irqs(); + + if (n_ints < 0) { + printf("error getting intr info for intr %i\n", irq_no); + return; + } + xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; + + intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * + 1000000000. / dt; + + printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); + printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); + + xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; + } +} + static void dump_stats(void) { unsigned long now = get_nsecs(); @@ -327,6 +431,8 @@ static void dump_stats(void) if (opt_app_stats) dump_app_stats(dt); + if (irq_no) + dump_driver_stats(dt); } static bool is_benchmark_done(void) @@ -804,6 +910,7 @@ static struct option long_options[] = { {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, + {"irq-string", no_argument, 0, 'I'}, {0, 0, 0, 0} }; @@ -841,6 +948,7 @@ static void usage(const char *prog) " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" + " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -856,7 +964,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQa", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", long_options, &option_index); if (c == -1) break; @@ -945,6 +1053,16 @@ static void parse_command_line(int argc, char **argv) break; case 'a': opt_app_stats = 1; + break; + case 'I': + opt_irq_str = optarg; + if (get_interrupt_number()) + irqs_at_init = get_irqs(); + if (irqs_at_init < 0) { + fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); + usage(basename(argv[0])); + } + break; default: usage(basename(argv[0])); -- cgit v1.2.3 From 8ac91df6de164923c1025e340ea81d085fb4ab85 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 11 Oct 2020 03:17:32 +0900 Subject: samples: bpf: Refactor xdp_monitor with libbpf To avoid confusion caused by the increasing fragmentation of the BPF Loader program, this commit would like to change to the libbpf loader instead of using the bpf_load. Thanks to libbpf's bpf_link interface, managing the tracepoint BPF program is much easier. bpf_program__attach_tracepoint manages the enable of tracepoint event and attach of BPF programs to it with a single interface bpf_link, so there is no need to manage event_fd and prog_fd separately. This commit refactors xdp_monitor with using this libbpf API, and the bpf_load is removed and migrated to libbpf. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010181734.1109-2-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/xdp_monitor_user.c | 159 +++++++++++++++++++++++++++++++---------- 2 files changed, 121 insertions(+), 40 deletions(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index e29b1e89dd3e..50ad7e8b05bc 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -99,7 +99,7 @@ per_socket_stats_example-objs := cookie_uid_helper_example.o xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o -xdp_monitor-objs := bpf_load.o xdp_monitor_user.o +xdp_monitor-objs := xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index ef53b93db573..03d0a182913f 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -26,12 +26,37 @@ static const char *__doc_err_only__= #include #include +#include #include -#include "bpf_load.h" +#include #include "bpf_util.h" +enum map_type { + REDIRECT_ERR_CNT, + EXCEPTION_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + DEVMAP_XMIT_CNT, +}; + +static const char *const map_type_strings[] = { + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [EXCEPTION_CNT] = "exception_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", +}; + +#define NUM_MAP 5 +#define NUM_TP 8 + +static int tp_cnt; +static int map_cnt; static int verbose = 1; static bool debug = false; +struct bpf_map *map_data[NUM_MAP] = {}; +struct bpf_link *tp_links[NUM_TP] = {}; +struct bpf_object *obj; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -41,6 +66,16 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +static void int_exit(int sig) +{ + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); + exit(0); +} + /* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ #define EXIT_FAIL_MEM 5 @@ -483,23 +518,23 @@ static bool stats_collect(struct stats_record *rec) * this can happen by someone running perf-record -e */ - fd = map_data[0].fd; /* map0: redirect_err_cnt */ + fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); for (i = 0; i < REDIR_RES_MAX; i++) map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); - fd = map_data[1].fd; /* map1: exception_cnt */ + fd = bpf_map__fd(map_data[EXCEPTION_CNT]); for (i = 0; i < XDP_ACTION_MAX; i++) { map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } - fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); for (i = 0; i < MAX_CPUS; i++) map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); - fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); - fd = map_data[4].fd; /* map4: devmap_xmit_cnt */ + fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); map_collect_record(fd, 0, &rec->xdp_devmap_xmit); return true; @@ -598,8 +633,8 @@ static void stats_poll(int interval, bool err_only) /* TODO Need more advanced stats on error types */ if (verbose) { - printf(" - Stats map0: %s\n", map_data[0].name); - printf(" - Stats map1: %s\n", map_data[1].name); + printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); + printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); printf("\n"); } fflush(stdout); @@ -618,44 +653,51 @@ static void stats_poll(int interval, bool err_only) static void print_bpf_prog_info(void) { - int i; + struct bpf_program *prog; + struct bpf_map *map; + int i = 0; /* Prog info */ - printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]); + printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt); + bpf_object__for_each_program(prog, obj) { + printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); + i++; } + i = 0; /* Maps info */ - printf("Loaded BPF prog have %d map(s)\n", map_data_count); - for (i = 0; i < map_data_count; i++) { - char *name = map_data[i].name; - int fd = map_data[i].fd; + printf("Loaded BPF prog have %d map(s)\n", map_cnt); + bpf_object__for_each_map(map, obj) { + const char *name = bpf_map__name(map); + int fd = bpf_map__fd(map); printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); + i++; } /* Event info */ - printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt); - for (i = 0; i < prog_cnt; i++) { - if (event_fd[i] != -1) - printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]); + printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); + for (i = 0; i < tp_cnt; i++) { + int fd = bpf_link__fd(tp_links[i]); + + if (fd != -1) + printf(" - event_fd[%d] = fd(%d)\n", i, fd); } } int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; int longindex = 0, opt; - int ret = EXIT_SUCCESS; - char bpf_obj_file[256]; + int ret = EXIT_FAILURE; + enum map_type type; + char filename[256]; /* Default settings: */ bool errors_only = true; int interval = 2; - snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]); - /* Parse commands line args */ while ((opt = getopt_long(argc, argv, "hDSs:", long_options, &longindex)) != -1) { @@ -672,40 +714,79 @@ int main(int argc, char **argv) case 'h': default: usage(argv); - return EXIT_FAILURE; + return ret; } } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK)"); - return EXIT_FAILURE; + return ret; } - if (load_bpf_file(bpf_obj_file)) { - printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return EXIT_FAILURE; + /* Remove tracepoint program when program is interrupted or killed */ + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + printf("ERROR: opening BPF object file failed\n"); + obj = NULL; + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + printf("ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + for (type = 0; type < NUM_MAP; type++) { + map_data[type] = + bpf_object__find_map_by_name(obj, map_type_strings[type]); + + if (libbpf_get_error(map_data[type])) { + printf("ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + map_cnt++; } - if (!prog_fd[0]) { - printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return EXIT_FAILURE; + + bpf_object__for_each_program(prog, obj) { + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + printf("ERROR: bpf_program__attach failed\n"); + tp_links[tp_cnt] = NULL; + goto cleanup; + } + tp_cnt++; } if (debug) { print_bpf_prog_info(); } - /* Unload/stop tracepoint event by closing fd's */ + /* Unload/stop tracepoint event by closing bpf_link's */ if (errors_only) { - /* The prog_fd[i] and event_fd[i] depend on the - * order the functions was defined in _kern.c + /* The bpf_link[i] depend on the order of + * the functions was defined in _kern.c */ - close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */ - close(prog_fd[2]); /* func: trace_xdp_redirect */ - close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */ - close(prog_fd[3]); /* func: trace_xdp_redirect_map */ + bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ + tp_links[2] = NULL; + + bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ + tp_links[3] = NULL; } stats_poll(interval, errors_only); + ret = EXIT_SUCCESS; + +cleanup: + /* Detach tracepoints */ + while (tp_cnt) + bpf_link__destroy(tp_links[--tp_cnt]); + + bpf_object__close(obj); return ret; } -- cgit v1.2.3 From 151936bf51afcdd6db52b3b73e339d021064b0ea Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 11 Oct 2020 03:17:33 +0900 Subject: samples: bpf: Replace attach_tracepoint() to attach() in xdp_redirect_cpu >From commit d7a18ea7e8b6 ("libbpf: Add generic bpf_program__attach()"), for some BPF programs, it is now possible to attach BPF programs with __attach() instead of explicitly calling __attach_(). This commit refactors the __attach_tracepoint() with libbpf's generic __attach() method. In addition, this refactors the logic of setting the map FD to simplify the code. Also, the missing removal of bpf_load.o in Makefile has been fixed. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010181734.1109-3-danieltimlee@gmail.com --- samples/bpf/Makefile | 2 +- samples/bpf/xdp_redirect_cpu_user.c | 153 +++++++++++++++++------------------- 2 files changed, 73 insertions(+), 82 deletions(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 50ad7e8b05bc..aeebf5d12f32 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -98,7 +98,7 @@ test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o -xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o +xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o xdp_monitor-objs := xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 3dd366e9474d..6fb8dbde62c5 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -37,18 +37,35 @@ static __u32 prog_id; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int n_cpus; -static int cpu_map_fd; -static int rx_cnt_map_fd; -static int redirect_err_cnt_map_fd; -static int cpumap_enqueue_cnt_map_fd; -static int cpumap_kthread_cnt_map_fd; -static int cpus_available_map_fd; -static int cpus_count_map_fd; -static int cpus_iterator_map_fd; -static int exception_cnt_map_fd; + +enum map_type { + CPU_MAP, + RX_CNT, + REDIRECT_ERR_CNT, + CPUMAP_ENQUEUE_CNT, + CPUMAP_KTHREAD_CNT, + CPUS_AVAILABLE, + CPUS_COUNT, + CPUS_ITERATOR, + EXCEPTION_CNT, +}; + +static const char *const map_type_strings[] = { + [CPU_MAP] = "cpu_map", + [RX_CNT] = "rx_cnt", + [REDIRECT_ERR_CNT] = "redirect_err_cnt", + [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", + [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", + [CPUS_AVAILABLE] = "cpus_available", + [CPUS_COUNT] = "cpus_count", + [CPUS_ITERATOR] = "cpus_iterator", + [EXCEPTION_CNT] = "exception_cnt", +}; #define NUM_TP 5 -struct bpf_link *tp_links[NUM_TP] = { 0 }; +#define NUM_MAP 9 +struct bpf_link *tp_links[NUM_TP] = {}; +static int map_fds[NUM_MAP]; static int tp_cnt = 0; /* Exit return codes */ @@ -527,20 +544,20 @@ static void stats_collect(struct stats_record *rec) { int fd, i; - fd = rx_cnt_map_fd; + fd = map_fds[RX_CNT]; map_collect_percpu(fd, 0, &rec->rx_cnt); - fd = redirect_err_cnt_map_fd; + fd = map_fds[REDIRECT_ERR_CNT]; map_collect_percpu(fd, 1, &rec->redir_err); - fd = cpumap_enqueue_cnt_map_fd; + fd = map_fds[CPUMAP_ENQUEUE_CNT]; for (i = 0; i < n_cpus; i++) map_collect_percpu(fd, i, &rec->enq[i]); - fd = cpumap_kthread_cnt_map_fd; + fd = map_fds[CPUMAP_KTHREAD_CNT]; map_collect_percpu(fd, 0, &rec->kthread); - fd = exception_cnt_map_fd; + fd = map_fds[EXCEPTION_CNT]; map_collect_percpu(fd, 0, &rec->exception); } @@ -565,7 +582,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Add a CPU entry to cpumap, as this allocate a cpu entry in * the kernel for the cpu. */ - ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0); + ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0); if (ret) { fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); exit(EXIT_FAIL_BPF); @@ -574,21 +591,21 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Inform bpf_prog's that a new CPU is available to select * from via some control maps. */ - ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0); + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0); if (ret) { fprintf(stderr, "Add to avail CPUs failed\n"); exit(EXIT_FAIL_BPF); } /* When not replacing/updating existing entry, bump the count */ - ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count); + ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count); if (ret) { fprintf(stderr, "Failed reading curr cpus_count\n"); exit(EXIT_FAIL_BPF); } if (new) { curr_cpus_count++; - ret = bpf_map_update_elem(cpus_count_map_fd, &key, + ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count, 0); if (ret) { fprintf(stderr, "Failed write curr cpus_count\n"); @@ -612,7 +629,7 @@ static void mark_cpus_unavailable(void) int ret, i; for (i = 0; i < n_cpus; i++) { - ret = bpf_map_update_elem(cpus_available_map_fd, &i, + ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i, &invalid_cpu, 0); if (ret) { fprintf(stderr, "Failed marking CPU unavailable\n"); @@ -665,68 +682,37 @@ static void stats_poll(int interval, bool use_separators, char *prog_name, free_stats_record(prev); } -static struct bpf_link * attach_tp(struct bpf_object *obj, - const char *tp_category, - const char* tp_name) +static int init_tracepoints(struct bpf_object *obj) { struct bpf_program *prog; - struct bpf_link *link; - char sec_name[PATH_MAX]; - int len; - len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s", - tp_category, tp_name); - if (len < 0) - exit(EXIT_FAIL); + bpf_object__for_each_program(prog, obj) { + if (bpf_program__is_tracepoint(prog) != true) + continue; - prog = bpf_object__find_program_by_title(obj, sec_name); - if (!prog) { - fprintf(stderr, "ERR: finding progsec: %s\n", sec_name); - exit(EXIT_FAIL_BPF); + tp_links[tp_cnt] = bpf_program__attach(prog); + if (libbpf_get_error(tp_links[tp_cnt])) { + tp_links[tp_cnt] = NULL; + return -EINVAL; + } + tp_cnt++; } - link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); - if (libbpf_get_error(link)) - exit(EXIT_FAIL_BPF); - - return link; -} - -static void init_tracepoints(struct bpf_object *obj) { - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue"); - tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread"); + return 0; } static int init_map_fds(struct bpf_object *obj) { - /* Maps updated by tracepoints */ - redirect_err_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt"); - exception_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "exception_cnt"); - cpumap_enqueue_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt"); - cpumap_kthread_cnt_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt"); - - /* Maps used by XDP */ - rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt"); - cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map"); - cpus_available_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_available"); - cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count"); - cpus_iterator_map_fd = - bpf_object__find_map_fd_by_name(obj, "cpus_iterator"); - - if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 || - redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 || - cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 || - cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 || - exception_cnt_map_fd < 0) - return -ENOENT; + enum map_type type; + + for (type = 0; type < NUM_MAP; type++) { + map_fds[type] = + bpf_object__find_map_fd_by_name(obj, + map_type_strings[type]); + + if (map_fds[type] < 0) + return -ENOENT; + } return 0; } @@ -795,13 +781,13 @@ int main(int argc, char **argv) bool stress_mode = false; struct bpf_program *prog; struct bpf_object *obj; + int err = EXIT_FAIL; char filename[256]; int added_cpus = 0; int longindex = 0; int interval = 2; int add_cpu = -1; - int opt, err; - int prog_fd; + int opt, prog_fd; int *cpu, i; __u32 qsize; @@ -824,24 +810,29 @@ int main(int argc, char **argv) } if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return EXIT_FAIL; + return err; if (prog_fd < 0) { fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", strerror(errno)); - return EXIT_FAIL; + return err; } - init_tracepoints(obj); + + if (init_tracepoints(obj) < 0) { + fprintf(stderr, "ERR: bpf_program__attach failed\n"); + return err; + } + if (init_map_fds(obj) < 0) { fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); - return EXIT_FAIL; + return err; } mark_cpus_unavailable(); cpu = malloc(n_cpus * sizeof(int)); if (!cpu) { fprintf(stderr, "failed to allocate cpu array\n"); - return EXIT_FAIL; + return err; } memset(cpu, 0, n_cpus * sizeof(int)); @@ -960,14 +951,12 @@ int main(int argc, char **argv) prog = bpf_object__find_program_by_title(obj, prog_name); if (!prog) { fprintf(stderr, "bpf_object__find_program_by_title failed\n"); - err = EXIT_FAIL; goto out; } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { fprintf(stderr, "bpf_program__fd failed\n"); - err = EXIT_FAIL; goto out; } @@ -986,6 +975,8 @@ int main(int argc, char **argv) stats_poll(interval, use_separators, prog_name, mprog_name, &value, stress_mode); + + err = EXIT_OK; out: free(cpu); return err; -- cgit v1.2.3 From 321f6324500eb2ec75b6fcff7dcd66d64ba18529 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 11 Oct 2020 03:17:34 +0900 Subject: samples: bpf: Refactor XDP kern program maps with BTF-defined map Most of the samples were converted to use the new BTF-defined MAP as they moved to libbpf, but some of the samples were missing. Instead of using the previous BPF MAP definition, this commit refactors xdp_monitor and xdp_sample_pkts_kern MAP definition with the new BTF-defined MAP format. Also, this commit removes the max_entries attribute at PERF_EVENT_ARRAY map type. The libbpf's bpf_object__create_map() will automatically set max_entries to the maximum configured number of CPUs on the host. Signed-off-by: Daniel T. Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010181734.1109-4-danieltimlee@gmail.com --- samples/bpf/xdp_monitor_kern.c | 60 +++++++++++++++++++------------------- samples/bpf/xdp_sample_pkts_kern.c | 14 ++++----- samples/bpf/xdp_sample_pkts_user.c | 1 - 3 files changed, 36 insertions(+), 39 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 3d33cca2d48a..5c955b812c47 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -6,21 +6,21 @@ #include #include -struct bpf_map_def SEC("maps") redirect_err_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = 2, +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, 2); /* TODO: have entries for all possible errno's */ -}; +} redirect_err_cnt SEC(".maps"); #define XDP_UNKNOWN XDP_REDIRECT + 1 -struct bpf_map_def SEC("maps") exception_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = XDP_UNKNOWN + 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, XDP_UNKNOWN + 1); +} exception_cnt SEC(".maps"); /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h @@ -129,19 +129,19 @@ struct datarec { }; #define MAX_CPUS 64 -struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = MAX_CPUS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, MAX_CPUS); +} cpumap_enqueue_cnt SEC(".maps"); -struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} cpumap_kthread_cnt SEC(".maps"); /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format * Code in: kernel/include/trace/events/xdp.h @@ -210,12 +210,12 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) return 0; } -struct bpf_map_def SEC("maps") devmap_xmit_cnt = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(struct datarec), - .max_entries = 1, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct datarec); + __uint(max_entries, 1); +} devmap_xmit_cnt SEC(".maps"); /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format * Code in: kernel/include/trace/events/xdp.h diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c index 33377289e2a8..9cf76b340dd7 100644 --- a/samples/bpf/xdp_sample_pkts_kern.c +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -5,14 +5,12 @@ #include #define SAMPLE_SIZE 64ul -#define MAX_CPUS 128 - -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = MAX_CPUS, -}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); +} my_map SEC(".maps"); SEC("xdp_sample") int xdp_sample_prog(struct xdp_md *ctx) diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index 991ef6f0880b..4b2a300c750c 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -18,7 +18,6 @@ #include "perf-sys.h" -#define MAX_CPUS 128 static int if_idx; static char *if_name; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -- cgit v1.2.3 From 3652c9a1b1fe6cbdd4510eb220db548bff8704ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 21 Oct 2020 22:32:57 +0200 Subject: bpf, libbpf: Guard bpf inline asm from bpf_tail_call_static Yaniv reported a compilation error after pulling latest libbpf: [...] ../libbpf/src/root/usr/include/bpf/bpf_helpers.h:99:10: error: unknown register name 'r0' in asm : "r0", "r1", "r2", "r3", "r4", "r5"); [...] The issue got triggered given Yaniv was compiling tracing programs with native target (e.g. x86) instead of BPF target, hence no BTF generated vmlinux.h nor CO-RE used, and later llc with -march=bpf was invoked to compile from LLVM IR to BPF object file. Given that clang was expecting x86 inline asm and not BPF one the error complained that these regs don't exist on the former. Guard bpf_tail_call_static() with defined(__bpf__) where BPF inline asm is valid to use. BPF tracing programs on more modern kernels use BPF target anyway and thus the bpf_tail_call_static() function will be available for them. BPF inline asm is supported since clang 7 (clang <= 6 otherwise throws same above error), and __bpf_unreachable() since clang 8, therefore include the latter condition in order to prevent compilation errors for older clang versions. Given even an old Ubuntu 18.04 LTS has official LLVM packages all the way up to llvm-10, I did not bother to special case the __bpf_unreachable() inside bpf_tail_call_static() further. Also, undo the sockex3_kern's use of bpf_tail_call_static() sample given they still have the old hacky way to even compile networking progs with native instead of BPF target so bpf_tail_call_static() won't be defined there anymore. Fixes: 0e9f6841f664 ("bpf, libbpf: Add bpf_tail_call_static helper for bpf programs") Reported-by: Yaniv Agman Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Tested-by: Yaniv Agman Link: https://lore.kernel.org/bpf/CAMy7=ZUk08w5Gc2Z-EKi4JFtuUCaZYmE4yzhJjrExXpYKR4L8w@mail.gmail.com Link: https://lore.kernel.org/bpf/20201021203257.26223-1-daniel@iogearbox.net --- samples/bpf/sockex3_kern.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'samples') diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 8142d02b33e6..b363503357e5 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -44,17 +44,17 @@ static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) switch (proto) { case ETH_P_8021Q: case ETH_P_8021AD: - bpf_tail_call_static(skb, &jmp_table, PARSE_VLAN); + bpf_tail_call(skb, &jmp_table, PARSE_VLAN); break; case ETH_P_MPLS_UC: case ETH_P_MPLS_MC: - bpf_tail_call_static(skb, &jmp_table, PARSE_MPLS); + bpf_tail_call(skb, &jmp_table, PARSE_MPLS); break; case ETH_P_IP: - bpf_tail_call_static(skb, &jmp_table, PARSE_IP); + bpf_tail_call(skb, &jmp_table, PARSE_IP); break; case ETH_P_IPV6: - bpf_tail_call_static(skb, &jmp_table, PARSE_IPV6); + bpf_tail_call(skb, &jmp_table, PARSE_IPV6); break; } } -- cgit v1.2.3 From c66dca98a24cb5f3493dd08d40bcfa94a220fa92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 27 Oct 2020 00:36:23 +0100 Subject: samples/bpf: Set rlimit for memlock to infinity in all samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memlock rlimit is a notorious source of failure for BPF programs. Most of the samples just set it to infinity, but a few used a lower limit. The problem with unconditionally setting a lower limit is that this will also override the limit if the system-wide setting is *higher* than the limit being set, which can lead to failures on systems that lock a lot of memory, but set 'ulimit -l' to unlimited before running a sample. One fix for this is to only conditionally set the limit if the current limit is lower, but it is simpler to just unify all the samples and have them all set the limit to infinity. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20201026233623.91728-1-toke@redhat.com --- samples/bpf/task_fd_query_user.c | 2 +- samples/bpf/tracex2_user.c | 2 +- samples/bpf/tracex3_user.c | 2 +- samples/bpf/xdp_redirect_cpu_user.c | 2 +- samples/bpf/xdp_rxq_info_user.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'samples') diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index 4a74531dc403..b68bd2f8fdc9 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -290,7 +290,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) int main(int argc, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; extern char __executable_start; char filename[256], buf[256]; __u64 uprobe_file_offset; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 3e36b3e4e3ef..3d6eab711d23 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -116,7 +116,7 @@ static void int_exit(int sig) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; long key, next_key, value; struct bpf_link *links[2]; struct bpf_program *prog; diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 70e987775c15..83e0fecbb01a 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -107,7 +107,7 @@ static void print_hist(int fd) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *links[2]; struct bpf_program *prog; struct bpf_object *obj; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 6fb8dbde62c5..f78cb18319aa 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -765,7 +765,7 @@ static int load_cpumap_prog(char *file_name, char *prog_name, int main(int argc, char **argv) { - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; char *mprog_filename = "xdp_redirect_kern.o"; char *redir_interface = NULL, *redir_map = NULL; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index caa4e7ffcfc7..93fa1bc54f13 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -450,7 +450,7 @@ static void stats_poll(int interval, int action, __u32 cfg_opt) int main(int argc, char **argv) { __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; -- cgit v1.2.3 From 80ade22c06ca115b81dd168e99479c8e09843513 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Tue, 27 Oct 2020 20:14:15 -0700 Subject: misc: mic: remove the MIC drivers This patch removes the MIC drivers from the kernel tree since the corresponding devices have been discontinued. Removing the dma and char-misc changes in one patch and merging via the char-misc tree is best to avoid any potential build breakage. Cc: Nikhil Rao Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Acked-By: Vinod Koul Reviewed-by: Sherry Sun Link: https://lore.kernel.org/r/8c1443136563de34699d2c084df478181c205db4.1603854416.git.sudeep.dutt@intel.com Signed-off-by: Greg Kroah-Hartman --- samples/mic/mpssd/.gitignore | 2 - samples/mic/mpssd/Makefile | 28 - samples/mic/mpssd/micctrl | 162 ---- samples/mic/mpssd/mpss | 189 ----- samples/mic/mpssd/mpssd.c | 1815 ------------------------------------------ samples/mic/mpssd/mpssd.h | 89 --- samples/mic/mpssd/sysfs.c | 91 --- 7 files changed, 2376 deletions(-) delete mode 100644 samples/mic/mpssd/.gitignore delete mode 100644 samples/mic/mpssd/Makefile delete mode 100755 samples/mic/mpssd/micctrl delete mode 100755 samples/mic/mpssd/mpss delete mode 100644 samples/mic/mpssd/mpssd.c delete mode 100644 samples/mic/mpssd/mpssd.h delete mode 100644 samples/mic/mpssd/sysfs.c (limited to 'samples') diff --git a/samples/mic/mpssd/.gitignore b/samples/mic/mpssd/.gitignore deleted file mode 100644 index aa03f1eb37a0..000000000000 --- a/samples/mic/mpssd/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -mpssd diff --git a/samples/mic/mpssd/Makefile b/samples/mic/mpssd/Makefile deleted file mode 100644 index a7a6e0c70424..000000000000 --- a/samples/mic/mpssd/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) - -ifeq ($(ARCH),x86) - -PROGS := mpssd -CC = $(CROSS_COMPILE)gcc -CFLAGS := -I../../../usr/include -I../../../tools/include - -ifdef DEBUG -CFLAGS += -DDEBUG=$(DEBUG) -endif - -all: $(PROGS) -mpssd: mpssd.c sysfs.c - $(CC) $(CFLAGS) mpssd.c sysfs.c -o mpssd -lpthread - -install: - install mpssd /usr/sbin/mpssd - install micctrl /usr/sbin/micctrl - -clean: - rm -fr $(PROGS) - -endif -endif diff --git a/samples/mic/mpssd/micctrl b/samples/mic/mpssd/micctrl deleted file mode 100755 index 030a60b04046..000000000000 --- a/samples/mic/mpssd/micctrl +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# Intel MIC User Space Tools. -# -# micctrl - Controls MIC boot/start/stop. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: micctrl -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -sysfs="/sys/class/mic" - -_status() -{ - f=$sysfs/$1 - echo -e $1 state: "`cat $f/state`" shutdown_status: "`cat $f/shutdown_status`" -} - -status() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _status $1 - return $? - fi - for f in $sysfs/* - do - _status `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_reset() -{ - f=$sysfs/$1 - echo reset > $f/state -} - -reset() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _reset $1 - return $? - fi - for f in $sysfs/* - do - _reset `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_boot() -{ - f=$sysfs/$1 - echo "linux" > $f/bootmode - echo "mic/uos.img" > $f/firmware - echo "mic/$1.image" > $f/ramdisk - echo "boot" > $f/state -} - -boot() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _boot $1 - return $? - fi - for f in $sysfs/* - do - _boot `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_shutdown() -{ - f=$sysfs/$1 - echo shutdown > $f/state -} - -shutdown() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _shutdown $1 - return $? - fi - for f in $sysfs/* - do - _shutdown `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_wait() -{ - f=$sysfs/$1 - while [ "`cat $f/state`" != "offline" -a "`cat $f/state`" != "online" ] - do - sleep 1 - echo -e "Waiting for $1 to go offline" - done -} - -wait() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _wait $1 - return $? - fi - # Wait for the cards to go offline - for f in $sysfs/* - do - _wait `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -if [ ! -d "$sysfs" ]; then - echo -e $"Module unloaded " - exit 3 -fi - -case $1 in - -s) - status $2 - ;; - -r) - reset $2 - ;; - -b) - boot $2 - ;; - -S) - shutdown $2 - ;; - -w) - wait $2 - ;; - *) - echo $"Usage: $0 {-s (status) |-r (reset) |-b (boot) |-S (shutdown) |-w (wait)}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpss b/samples/mic/mpssd/mpss deleted file mode 100755 index 248ac7313c71..000000000000 --- a/samples/mic/mpssd/mpss +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# Intel MIC User Space Tools. -# -# mpss Start mpssd. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: mpss -# Required-Start: -# Required-Stop: -# Short-Description: MPSS stack control -# Description: MPSS stack control -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -exec=/usr/sbin/mpssd -sysfs="/sys/class/mic" -mic_modules="mic_host mic_x100_dma scif vop" - -start() -{ - [ -x $exec ] || exit 5 - - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -1`" = "mpssd" ]; then - echo -e $"MPSSD already running! " - success - echo - return 0 - fi - - echo -e $"Starting MPSS Stack" - echo -e $"Loading MIC drivers:" $mic_modules - - modprobe -a $mic_modules - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - - # Start the daemon - echo -n $"Starting MPSSD " - $exec - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - success - echo - - sleep 5 - - # Boot the cards - micctrl -b - - # Wait till ping works - for f in $sysfs/* - do - count=100 - ipaddr=`cat $f/cmdline` - ipaddr=${ipaddr#*address,} - ipaddr=`echo $ipaddr | cut -d, -f1 | cut -d\; -f1` - while [ $count -ge 0 ] - do - echo -e "Pinging "`basename $f`" " - ping -c 1 $ipaddr &> /dev/null - RETVAL=$? - if [ $RETVAL -eq 0 ]; then - success - break - fi - sleep 1 - count=`expr $count - 1` - done - [ $RETVAL -ne 0 ] && failure || success - echo - done - return $RETVAL -} - -stop() -{ - echo -e $"Shutting down MPSS Stack: " - - # Bail out if module is unloaded - if [ ! -d "$sysfs" ]; then - echo -n $"Module unloaded " - success - echo - return 0 - fi - - # Shut down the cards. - micctrl -S - - # Wait for the cards to go offline - for f in $sysfs/* - do - while [ "`cat $f/state`" != "ready" ] - do - sleep 1 - echo -e "Waiting for "`basename $f`" to become ready" - done - done - - # Display the status of the cards - micctrl -s - - # Kill MPSSD now - echo -n $"Killing MPSSD" - killall -9 mpssd 2>/dev/null - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -restart() -{ - stop - sleep 5 - start -} - -status() -{ - micctrl -s - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -n 1`" = "mpssd" ]; then - echo "mpssd is running" - else - echo "mpssd is stopped" - fi - return 0 -} - -unload() -{ - if [ ! -d "$sysfs" ]; then - echo -n $"No MIC_HOST Module: " - success - echo - return - fi - - stop - - sleep 5 - echo -n $"Removing MIC drivers:" $mic_modules - modprobe -r $mic_modules - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -case $1 in - start) - start - ;; - stop) - stop - ;; - restart) - restart - ;; - status) - status - ;; - unload) - unload - ;; - *) - echo $"Usage: $0 {start|stop|restart|status|unload}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c deleted file mode 100644 index c03a05d498f0..000000000000 --- a/samples/mic/mpssd/mpssd.c +++ /dev/null @@ -1,1815 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "mpssd.h" -#include -#include -#include - -static void *init_mic(void *arg); - -static FILE *logfp; -static struct mic_info mic_list; - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? __min1 : __min2; }) - -/* align addr on a size boundary - adjust address up/down if needed */ -#define _ALIGN_DOWN(addr, size) ((addr)&(~((size)-1))) -#define _ALIGN_UP(addr, size) _ALIGN_DOWN(addr + size - 1, size) - -/* align addr on a size boundary - adjust address up if needed */ -#define _ALIGN(addr, size) _ALIGN_UP(addr, size) - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) - -#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) - -#define GSO_ENABLED 1 -#define MAX_GSO_SIZE (64 * 1024) -#define ETH_H_LEN 14 -#define MAX_NET_PKT_SIZE (_ALIGN_UP(MAX_GSO_SIZE + ETH_H_LEN, 64)) -#define MIC_DEVICE_PAGE_END 0x1000 - -#ifndef VIRTIO_NET_HDR_F_DATA_VALID -#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ -#endif - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_console_config cons_config; -} virtcons_dev_page = { - .dd = { - .type = VIRTIO_ID_CONSOLE, - .num_vq = ARRAY_SIZE(virtcons_dev_page.vqconfig), - .feature_len = sizeof(virtcons_dev_page.host_features), - .config_len = sizeof(virtcons_dev_page.cons_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -}; - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_net_config net_config; -} virtnet_dev_page = { - .dd = { - .type = VIRTIO_ID_NET, - .num_vq = ARRAY_SIZE(virtnet_dev_page.vqconfig), - .feature_len = sizeof(virtnet_dev_page.host_features), - .config_len = sizeof(virtnet_dev_page.net_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -#if GSO_ENABLED - .host_features = htole32( - 1 << VIRTIO_NET_F_CSUM | - 1 << VIRTIO_NET_F_GSO | - 1 << VIRTIO_NET_F_GUEST_TSO4 | - 1 << VIRTIO_NET_F_GUEST_TSO6 | - 1 << VIRTIO_NET_F_GUEST_ECN), -#else - .host_features = 0, -#endif -}; - -static const char *mic_config_dir = "/etc/mpss"; -static const char *virtblk_backend = "VIRTBLK_BACKEND"; -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[1]; - __u32 host_features, guest_acknowledgements; - struct virtio_blk_config blk_config; -} virtblk_dev_page = { - .dd = { - .type = VIRTIO_ID_BLOCK, - .num_vq = ARRAY_SIZE(virtblk_dev_page.vqconfig), - .feature_len = sizeof(virtblk_dev_page.host_features), - .config_len = sizeof(virtblk_dev_page.blk_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .host_features = - htole32(1<name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - snprintf(ipaddr, IFNAMSIZ, "172.31.%d.254/24", mic->id + 1); - - pid = fork(); - if (pid == 0) { - ifargv[0] = "ip"; - ifargv[1] = "addr"; - ifargv[2] = "add"; - ifargv[3] = ipaddr; - ifargv[4] = "dev"; - ifargv[5] = dev; - ifargv[6] = NULL; - mpsslog("Configuring %s ipaddr %s\n", dev, ipaddr); - ret = execvp("ip", ifargv); - if (ret < 0) { - mpsslog("%s execvp failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - mpsslog("MIC name %s %s %d DONE!\n", - mic->name, __func__, __LINE__); - return 0; -} - -static int tun_alloc(struct mic_info *mic, char *dev) -{ - struct ifreq ifr; - int fd, err; -#if GSO_ENABLED - unsigned offload; -#endif - fd = open("/dev/net/tun", O_RDWR); - if (fd < 0) { - mpsslog("Could not open /dev/net/tun %s\n", strerror(errno)); - goto done; - } - - memset(&ifr, 0, sizeof(ifr)); - - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - if (*dev) - strncpy(ifr.ifr_name, dev, IFNAMSIZ); - - err = ioctl(fd, TUNSETIFF, (void *)&ifr); - if (err < 0) { - mpsslog("%s %s %d TUNSETIFF failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#if GSO_ENABLED - offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN; - - err = ioctl(fd, TUNSETOFFLOAD, offload); - if (err < 0) { - mpsslog("%s %s %d TUNSETOFFLOAD failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#endif - strcpy(dev, ifr.ifr_name); - mpsslog("Created TAP %s\n", dev); -done: - return fd; -} - -#define NET_FD_VIRTIO_NET 0 -#define NET_FD_TUN 1 -#define MAX_NET_FD 2 - -static void set_dp(struct mic_info *mic, int type, void *dp) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - mic->mic_console.console_dp = dp; - return; - case VIRTIO_ID_NET: - mic->mic_net.net_dp = dp; - return; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.block_dp = dp; - return; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); -} - -static void *get_dp(struct mic_info *mic, int type) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - return mic->mic_console.console_dp; - case VIRTIO_ID_NET: - return mic->mic_net.net_dp; - case VIRTIO_ID_BLOCK: - return mic->mic_virtblk.block_dp; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); - return NULL; -} - -static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type) -{ - struct mic_device_desc *d; - int i; - void *dp = get_dp(mic, type); - - for (i = sizeof(struct mic_bootparam); i < PAGE_SIZE; - i += mic_total_desc_size(d)) { - d = dp + i; - - /* End of list */ - if (d->type == 0) - break; - - if (d->type == -1) - continue; - - mpsslog("%s %s d-> type %d d %p\n", - mic->name, __func__, d->type, d); - - if (d->type == (__u8)type) - return d; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - return NULL; -} - -/* See comments in vhost.c for explanation of next_desc() */ -static unsigned next_desc(struct vring_desc *desc) -{ - unsigned int next; - - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) - return -1U; - next = le16toh(desc->next); - return next; -} - -/* Sum up all the IOVEC length */ -static ssize_t -sum_iovec_len(struct mic_copy_desc *copy) -{ - ssize_t sum = 0; - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - sum += copy->iov[i].iov_len; - return sum; -} - -static inline void verify_out_len(struct mic_info *mic, - struct mic_copy_desc *copy) -{ - if (copy->out_len != sum_iovec_len(copy)) { - mpsslog("%s %s %d BUG copy->out_len 0x%x len 0x%zx\n", - mic->name, __func__, __LINE__, - copy->out_len, sum_iovec_len(copy)); - assert(copy->out_len == sum_iovec_len(copy)); - } -} - -/* Display an iovec */ -static void -disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy, - const char *s, int line) -{ - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - mpsslog("%s %s %d copy->iov[%d] addr %p len 0x%zx\n", - mic->name, s, line, i, - copy->iov[i].iov_base, copy->iov[i].iov_len); -} - -static inline __u16 read_avail_idx(struct mic_vring *vr) -{ - return READ_ONCE(vr->info->avail_idx); -} - -static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr, - struct mic_copy_desc *copy, ssize_t len) -{ - copy->vr_idx = tx ? 0 : 1; - copy->update_used = true; - if (type == VIRTIO_ID_NET) - copy->iov[1].iov_len = len - sizeof(struct virtio_net_hdr); - else - copy->iov[0].iov_len = len; -} - -/* Central API which triggers the copies */ -static int -mic_virtio_copy(struct mic_info *mic, int fd, - struct mic_vring *vr, struct mic_copy_desc *copy) -{ - int ret; - - ret = ioctl(fd, MIC_VIRTIO_COPY_DESC, copy); - if (ret) { - mpsslog("%s %s %d errno %s ret %d\n", - mic->name, __func__, __LINE__, - strerror(errno), ret); - } - return ret; -} - -static inline unsigned _vring_size(unsigned int num, unsigned long align) -{ - return _ALIGN_UP(((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) - + align - 1) & ~(align - 1)) - + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num, 4); -} - -/* - * This initialization routine requires at least one - * vring i.e. vr0. vr1 is optional. - */ -static void * -init_vr(struct mic_info *mic, int fd, int type, - struct mic_vring *vr0, struct mic_vring *vr1, int num_vq) -{ - int vr_size; - char *va; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - va = mmap(NULL, MIC_DEVICE_PAGE_END + vr_size * num_vq, - PROT_READ, MAP_SHARED, fd, 0); - if (MAP_FAILED == va) { - mpsslog("%s %s %d mmap failed errno %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - goto done; - } - set_dp(mic, type, va); - vr0->va = (struct mic_vring *)&va[MIC_DEVICE_PAGE_END]; - vr0->info = vr0->va + - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN); - vring_init(&vr0->vr, - MIC_VRING_ENTRIES, vr0->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr0 %p vr0->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr0->va, vr0->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr0->info->magic), MIC_MAGIC + type); - assert(le32toh(vr0->info->magic) == MIC_MAGIC + type); - if (vr1) { - vr1->va = (struct mic_vring *) - &va[MIC_DEVICE_PAGE_END + vr_size]; - vr1->info = vr1->va + _vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN); - vring_init(&vr1->vr, - MIC_VRING_ENTRIES, vr1->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr1 %p vr1->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr1->va, vr1->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr1->info->magic), MIC_MAGIC + type + 1); - assert(le32toh(vr1->info->magic) == MIC_MAGIC + type + 1); - } -done: - return va; -} - -static int -wait_for_card_driver(struct mic_info *mic, int fd, int type) -{ - struct pollfd pollfd; - int err; - struct mic_device_desc *desc = get_device_desc(mic, type); - __u8 prev_status; - - if (!desc) - return -ENODEV; - prev_status = desc->status; - pollfd.fd = fd; - mpsslog("%s %s Waiting .... desc-> type %d status 0x%x\n", - mic->name, __func__, type, desc->status); - - while (1) { - pollfd.events = POLLIN; - pollfd.revents = 0; - err = poll(&pollfd, 1, -1); - if (err < 0) { - mpsslog("%s %s poll failed %s\n", - mic->name, __func__, strerror(errno)); - continue; - } - - if (pollfd.revents) { - if (desc->status != prev_status) { - mpsslog("%s %s Waiting... desc-> type %d " - "status 0x%x\n", - mic->name, __func__, type, - desc->status); - prev_status = desc->status; - } - if (desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - mpsslog("%s %s poll.revents %d\n", - mic->name, __func__, pollfd.revents); - mpsslog("%s %s desc-> type %d status 0x%x\n", - mic->name, __func__, type, - desc->status); - break; - } - } - } - return 0; -} - -/* Spin till we have some descriptors */ -static void -spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr) -{ - __u16 avail_idx = read_avail_idx(vr); - - while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) { -#ifdef DEBUG - mpsslog("%s %s waiting for desc avail %d info_avail %d\n", - mic->name, __func__, - le16toh(vr->vr.avail->idx), vr->info->avail_idx); -#endif - sched_yield(); - } -} - -static void * -virtio_net(void *arg) -{ - static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)]; - static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __attribute__ ((aligned(64))); - struct iovec vnet_iov[2][2] = { - { { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) }, - { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } }, - { { .iov_base = vnet_hdr[1], .iov_len = sizeof(vnet_hdr[1]) }, - { .iov_base = vnet_buf[1], .iov_len = sizeof(vnet_buf[1]) } }, - }; - struct iovec *iov0 = vnet_iov[0], *iov1 = vnet_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - char if_name[IFNAMSIZ]; - struct pollfd net_poll[MAX_NET_FD]; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - int err; - - snprintf(if_name, IFNAMSIZ, "mic%d", mic->id); - mic->mic_net.tap_fd = tun_alloc(mic, if_name); - if (mic->mic_net.tap_fd < 0) - goto done; - - if (tap_configure(mic, if_name)) - goto done; - mpsslog("MIC name %s id %d\n", mic->name, mic->id); - - net_poll[NET_FD_VIRTIO_NET].fd = mic->mic_net.virtio_net_fd; - net_poll[NET_FD_VIRTIO_NET].events = POLLIN; - net_poll[NET_FD_TUN].fd = mic->mic_net.tap_fd; - net_poll[NET_FD_TUN].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET, &tx_vr, &rx_vr, - virtnet_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto done; - } - - copy.iovcnt = 2; - desc = get_device_desc(mic, VIRTIO_ID_NET); - - while (1) { - ssize_t len; - - net_poll[NET_FD_VIRTIO_NET].revents = 0; - net_poll[NET_FD_TUN].revents = 0; - - /* Start polling for data from tap and virtio net */ - err = poll(net_poll, 2, -1); - if (err < 0) { - mpsslog("%s poll failed %s\n", - __func__, strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - /* - * Check if there is data to be read from TUN and write to - * virtio net fd if there is. - */ - if (net_poll[NET_FD_TUN].revents & POLLIN) { - copy.iov = iov0; - len = readv(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len > 0) { - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *)vnet_hdr[0]; - - /* Disable checksums on the card since we are on - a reliable PCIe link */ - hdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID; -#ifdef DEBUG - mpsslog("%s %s %d hdr->flags 0x%x ", mic->name, - __func__, __LINE__, hdr->flags); - mpsslog("copy.out_len %d hdr->gso_type 0x%x\n", - copy.out_len, hdr->gso_type); -#endif -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_NET, 1, &tx_vr, ©, - len); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &tx_vr, - ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(©)); -#endif - /* Reinitialize IOV for next run */ - iov0[1].iov_len = MAX_NET_PKT_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", mic->name, - __func__, __LINE__, strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - /* - * Check if there is data to be read from virtio net and - * write to TUN if there is. - */ - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_NET, 0, &rx_vr, ©, - MAX_NET_PKT_SIZE - + sizeof(struct virtio_net_hdr)); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &rx_vr, - ©); - if (!err) { -#ifdef DEBUG - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *) - vnet_hdr[1]; - - mpsslog("%s %s %d hdr->flags 0x%x, ", - mic->name, __func__, __LINE__, - hdr->flags); - mpsslog("out_len %d gso_type 0x%x\n", - copy.out_len, - hdr->gso_type); -#endif - /* Set the correct output iov_len */ - iov1[1].iov_len = copy.out_len - - sizeof(struct virtio_net_hdr); - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(©)); -#endif - len = writev(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -done: - pthread_exit(NULL); -} - -/* virtio_console */ -#define VIRTIO_CONSOLE_FD 0 -#define MONITOR_FD (VIRTIO_CONSOLE_FD + 1) -#define MAX_CONSOLE_FD (MONITOR_FD + 1) /* must be the last one + 1 */ -#define MAX_BUFFER_SIZE PAGE_SIZE - -static void * -virtio_console(void *arg) -{ - static __u8 vcons_buf[2][PAGE_SIZE]; - struct iovec vcons_iov[2] = { - { .iov_base = vcons_buf[0], .iov_len = sizeof(vcons_buf[0]) }, - { .iov_base = vcons_buf[1], .iov_len = sizeof(vcons_buf[1]) }, - }; - struct iovec *iov0 = &vcons_iov[0], *iov1 = &vcons_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - int err; - struct pollfd console_poll[MAX_CONSOLE_FD]; - int pty_fd; - char *pts_name; - ssize_t len; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - - pty_fd = posix_openpt(O_RDWR); - if (pty_fd < 0) { - mpsslog("can't open a pseudoterminal master device: %s\n", - strerror(errno)); - goto _return; - } - pts_name = ptsname(pty_fd); - if (pts_name == NULL) { - mpsslog("can't get pts name\n"); - goto _close_pty; - } - printf("%s console message goes to %s\n", mic->name, pts_name); - mpsslog("%s console message goes to %s\n", mic->name, pts_name); - err = grantpt(pty_fd); - if (err < 0) { - mpsslog("can't grant access: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - err = unlockpt(pty_fd); - if (err < 0) { - mpsslog("can't unlock a pseudoterminal: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - console_poll[MONITOR_FD].fd = pty_fd; - console_poll[MONITOR_FD].events = POLLIN; - - console_poll[VIRTIO_CONSOLE_FD].fd = mic->mic_console.virtio_console_fd; - console_poll[VIRTIO_CONSOLE_FD].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE, &tx_vr, &rx_vr, - virtcons_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto _close_pty; - } - - copy.iovcnt = 1; - desc = get_device_desc(mic, VIRTIO_ID_CONSOLE); - - for (;;) { - console_poll[MONITOR_FD].revents = 0; - console_poll[VIRTIO_CONSOLE_FD].revents = 0; - err = poll(console_poll, MAX_CONSOLE_FD, -1); - if (err < 0) { - mpsslog("%s %d: poll failed: %s\n", __func__, __LINE__, - strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - - if (console_poll[MONITOR_FD].revents & POLLIN) { - copy.iov = iov0; - len = readv(pty_fd, copy.iov, copy.iovcnt); - if (len > 0) { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_CONSOLE, 1, &tx_vr, - ©, len); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &tx_vr, ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(©)); -#endif - /* Reinitialize IOV for next run */ - iov0->iov_len = PAGE_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", - mic->name, __func__, __LINE__, - strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - if (console_poll[VIRTIO_CONSOLE_FD].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_CONSOLE, 0, &rx_vr, - ©, PAGE_SIZE); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &rx_vr, ©); - if (!err) { - /* Set the correct output iov_len */ - iov1->iov_len = copy.out_len; - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(©)); -#endif - len = writev(pty_fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (console_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -_close_pty: - close(pty_fd); -_return: - pthread_exit(NULL); -} - -static void -add_virtio_device(struct mic_info *mic, struct mic_device_desc *dd) -{ - char path[PATH_MAX]; - int fd, err; - - snprintf(path, PATH_MAX, "/dev/vop_virtio%d", mic->id); - fd = open(path, O_RDWR); - if (fd < 0) { - mpsslog("Could not open %s %s\n", path, strerror(errno)); - return; - } - - err = ioctl(fd, MIC_VIRTIO_ADD_DEVICE, dd); - if (err < 0) { - mpsslog("Could not add %d %s\n", dd->type, strerror(errno)); - close(fd); - return; - } - switch (dd->type) { - case VIRTIO_ID_NET: - mic->mic_net.virtio_net_fd = fd; - mpsslog("Added VIRTIO_ID_NET for %s\n", mic->name); - break; - case VIRTIO_ID_CONSOLE: - mic->mic_console.virtio_console_fd = fd; - mpsslog("Added VIRTIO_ID_CONSOLE for %s\n", mic->name); - break; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.virtio_block_fd = fd; - mpsslog("Added VIRTIO_ID_BLOCK for %s\n", mic->name); - break; - } -} - -static bool -set_backend_file(struct mic_info *mic) -{ - FILE *config; - char buff[PATH_MAX], *line, *evv, *p; - - snprintf(buff, PATH_MAX, "%s/mpssd%03d.conf", mic_config_dir, mic->id); - config = fopen(buff, "r"); - if (config == NULL) - return false; - do { /* look for "virtblk_backend=XXXX" */ - line = fgets(buff, PATH_MAX, config); - if (line == NULL) - break; - if (*line == '#') - continue; - p = strchr(line, '\n'); - if (p) - *p = '\0'; - } while (strncmp(line, virtblk_backend, strlen(virtblk_backend)) != 0); - fclose(config); - if (line == NULL) - return false; - evv = strchr(line, '='); - if (evv == NULL) - return false; - mic->mic_virtblk.backend_file = malloc(strlen(evv) + 1); - if (mic->mic_virtblk.backend_file == NULL) { - mpsslog("%s %d can't allocate memory\n", mic->name, mic->id); - return false; - } - strcpy(mic->mic_virtblk.backend_file, evv + 1); - return true; -} - -#define SECTOR_SIZE 512 -static bool -set_backend_size(struct mic_info *mic) -{ - mic->mic_virtblk.backend_size = lseek(mic->mic_virtblk.backend, 0, - SEEK_END); - if (mic->mic_virtblk.backend_size < 0) { - mpsslog("%s: can't seek: %s\n", - mic->name, mic->mic_virtblk.backend_file); - return false; - } - virtblk_dev_page.blk_config.capacity = - mic->mic_virtblk.backend_size / SECTOR_SIZE; - if ((mic->mic_virtblk.backend_size % SECTOR_SIZE) != 0) - virtblk_dev_page.blk_config.capacity++; - - virtblk_dev_page.blk_config.capacity = - htole64(virtblk_dev_page.blk_config.capacity); - - return true; -} - -static bool -open_backend(struct mic_info *mic) -{ - if (!set_backend_file(mic)) - goto _error_exit; - mic->mic_virtblk.backend = open(mic->mic_virtblk.backend_file, O_RDWR); - if (mic->mic_virtblk.backend < 0) { - mpsslog("%s: can't open: %s\n", mic->name, - mic->mic_virtblk.backend_file); - goto _error_free; - } - if (!set_backend_size(mic)) - goto _error_close; - mic->mic_virtblk.backend_addr = mmap(NULL, - mic->mic_virtblk.backend_size, - PROT_READ|PROT_WRITE, MAP_SHARED, - mic->mic_virtblk.backend, 0L); - if (mic->mic_virtblk.backend_addr == MAP_FAILED) { - mpsslog("%s: can't map: %s %s\n", - mic->name, mic->mic_virtblk.backend_file, - strerror(errno)); - goto _error_close; - } - return true; - - _error_close: - close(mic->mic_virtblk.backend); - _error_free: - free(mic->mic_virtblk.backend_file); - _error_exit: - return false; -} - -static void -close_backend(struct mic_info *mic) -{ - munmap(mic->mic_virtblk.backend_addr, mic->mic_virtblk.backend_size); - close(mic->mic_virtblk.backend); - free(mic->mic_virtblk.backend_file); -} - -static bool -start_virtblk(struct mic_info *mic, struct mic_vring *vring) -{ - if (((unsigned long)&virtblk_dev_page.blk_config % 8) != 0) { - mpsslog("%s: blk_config is not 8 byte aligned.\n", - mic->name); - return false; - } - add_virtio_device(mic, &virtblk_dev_page.dd); - if (MAP_FAILED == init_vr(mic, mic->mic_virtblk.virtio_block_fd, - VIRTIO_ID_BLOCK, vring, NULL, - virtblk_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - return false; - } - return true; -} - -static void -stop_virtblk(struct mic_info *mic) -{ - int vr_size, ret; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - ret = munmap(mic->mic_virtblk.block_dp, - MIC_DEVICE_PAGE_END + vr_size * virtblk_dev_page.dd.num_vq); - if (ret < 0) - mpsslog("%s munmap errno %d\n", mic->name, errno); - close(mic->mic_virtblk.virtio_block_fd); -} - -static __u8 -header_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(struct virtio_blk_outhdr)) { - mpsslog("%s() %d: length is not sizeof(virtio_blk_outhd)\n", - __func__, __LINE__); - return -EIO; - } - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) { - mpsslog("%s() %d: alone\n", - __func__, __LINE__); - return -EIO; - } - if (le16toh(desc->flags) & VRING_DESC_F_WRITE) { - mpsslog("%s() %d: not read\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -read_header(int fd, struct virtio_blk_outhdr *hdr, __u32 desc_idx) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_len = sizeof(*hdr); - iovec.iov_base = hdr; - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static int -transfer_blocks(int fd, struct iovec *iovec, __u32 iovcnt) -{ - struct mic_copy_desc copy; - - copy.iov = iovec; - copy.iovcnt = iovcnt; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static __u8 -status_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(__u8)) { - mpsslog("%s() %d: length is not sizeof(status)\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -write_status(int fd, __u8 *status) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_base = status; - iovec.iov_len = sizeof(*status); - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = true; /* Update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -#ifndef VIRTIO_BLK_T_GET_ID -#define VIRTIO_BLK_T_GET_ID 8 -#endif - -static void * -virtio_block(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int ret; - struct pollfd block_poll; - struct mic_vring vring; - __u16 avail_idx; - __u32 desc_idx; - struct vring_desc *desc; - struct iovec *iovec, *piov; - __u8 status; - __u32 buffer_desc_idx; - struct virtio_blk_outhdr hdr; - void *fos; - - for (;;) { /* forever */ - if (!open_backend(mic)) { /* No virtblk */ - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) - sleep(1); - continue; - } - - /* backend file is specified. */ - if (!start_virtblk(mic, &vring)) - goto _close_backend; - iovec = malloc(sizeof(*iovec) * - le32toh(virtblk_dev_page.blk_config.seg_max)); - if (!iovec) { - mpsslog("%s: can't alloc iovec: %s\n", - mic->name, strerror(ENOMEM)); - goto _stop_virtblk; - } - - block_poll.fd = mic->mic_virtblk.virtio_block_fd; - block_poll.events = POLLIN; - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) { - block_poll.revents = 0; - /* timeout in 1 sec to see signaled */ - ret = poll(&block_poll, 1, 1000); - if (ret < 0) { - mpsslog("%s %d: poll failed: %s\n", - __func__, __LINE__, - strerror(errno)); - continue; - } - - if (!(block_poll.revents & POLLIN)) { -#ifdef DEBUG - mpsslog("%s %d: block_poll.revents=0x%x\n", - __func__, __LINE__, block_poll.revents); -#endif - continue; - } - - /* POLLIN */ - while (vring.info->avail_idx != - le16toh(vring.vr.avail->idx)) { - /* read header element */ - avail_idx = - vring.info->avail_idx & - (vring.vr.num - 1); - desc_idx = le16toh( - vring.vr.avail->ring[avail_idx]); - desc = &vring.vr.desc[desc_idx]; -#ifdef DEBUG - mpsslog("%s() %d: avail_idx=%d ", - __func__, __LINE__, - vring.info->avail_idx); - mpsslog("vring.vr.num=%d desc=%p\n", - vring.vr.num, desc); -#endif - status = header_error_check(desc); - ret = read_header( - mic->mic_virtblk.virtio_block_fd, - &hdr, desc_idx); - if (ret < 0) { - mpsslog("%s() %d %s: ret=%d %s\n", - __func__, __LINE__, - mic->name, ret, - strerror(errno)); - break; - } - /* buffer element */ - piov = iovec; - status = 0; - fos = mic->mic_virtblk.backend_addr + - (hdr.sector * SECTOR_SIZE); - buffer_desc_idx = next_desc(desc); - desc_idx = buffer_desc_idx; - for (desc = &vring.vr.desc[buffer_desc_idx]; - desc->flags & VRING_DESC_F_NEXT; - desc_idx = next_desc(desc), - desc = &vring.vr.desc[desc_idx]) { - piov->iov_len = desc->len; - piov->iov_base = fos; - piov++; - fos += desc->len; - } - /* Returning NULLs for VIRTIO_BLK_T_GET_ID. */ - if (hdr.type & ~(VIRTIO_BLK_T_OUT | - VIRTIO_BLK_T_GET_ID)) { - /* - VIRTIO_BLK_T_IN - does not do - anything. Probably for documenting. - VIRTIO_BLK_T_SCSI_CMD - for - virtio_scsi. - VIRTIO_BLK_T_FLUSH - turned off in - config space. - VIRTIO_BLK_T_BARRIER - defined but not - used in anywhere. - */ - mpsslog("%s() %d: type %x ", - __func__, __LINE__, - hdr.type); - mpsslog("is not supported\n"); - status = -ENOTSUP; - - } else { - ret = transfer_blocks( - mic->mic_virtblk.virtio_block_fd, - iovec, - piov - iovec); - if (ret < 0 && - status != 0) - status = ret; - } - /* write status and update used pointer */ - if (status != 0) - status = status_error_check(desc); - ret = write_status( - mic->mic_virtblk.virtio_block_fd, - &status); -#ifdef DEBUG - mpsslog("%s() %d: write status=%d on desc=%p\n", - __func__, __LINE__, - status, desc); -#endif - } - } - free(iovec); -_stop_virtblk: - stop_virtblk(mic); -_close_backend: - close_backend(mic); - } /* forever */ - - pthread_exit(NULL); -} - -static void -reset(struct mic_info *mic) -{ -#define RESET_TIMEOUT 120 - int i = RESET_TIMEOUT; - setsysfs(mic->name, "state", "reset"); - while (i) { - char *state; - state = readsysfs(mic->name, "state"); - if (!state) - goto retry; - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - free(state); - break; - } - free(state); -retry: - sleep(1); - i--; - } -} - -static int -get_mic_shutdown_status(struct mic_info *mic, char *shutdown_status) -{ - if (!strcmp(shutdown_status, "nop")) - return MIC_NOP; - if (!strcmp(shutdown_status, "crashed")) - return MIC_CRASHED; - if (!strcmp(shutdown_status, "halted")) - return MIC_HALTED; - if (!strcmp(shutdown_status, "poweroff")) - return MIC_POWER_OFF; - if (!strcmp(shutdown_status, "restart")) - return MIC_RESTART; - mpsslog("%s: BUG invalid status %s\n", mic->name, shutdown_status); - /* Invalid state */ - assert(0); -}; - -static int get_mic_state(struct mic_info *mic) -{ - char *state = NULL; - enum mic_states mic_state; - - while (!state) { - state = readsysfs(mic->name, "state"); - sleep(1); - } - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - mic_state = MIC_READY; - } else if (!strcmp(state, "booting")) { - mic_state = MIC_BOOTING; - } else if (!strcmp(state, "online")) { - mic_state = MIC_ONLINE; - } else if (!strcmp(state, "shutting_down")) { - mic_state = MIC_SHUTTING_DOWN; - } else if (!strcmp(state, "reset_failed")) { - mic_state = MIC_RESET_FAILED; - } else if (!strcmp(state, "resetting")) { - mic_state = MIC_RESETTING; - } else { - mpsslog("%s: BUG invalid state %s\n", mic->name, state); - assert(0); - } - - free(state); - return mic_state; -}; - -static void mic_handle_shutdown(struct mic_info *mic) -{ -#define SHUTDOWN_TIMEOUT 60 - int i = SHUTDOWN_TIMEOUT; - char *shutdown_status; - while (i) { - shutdown_status = readsysfs(mic->name, "shutdown_status"); - if (!shutdown_status) { - sleep(1); - continue; - } - mpsslog("%s: %s %d shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - switch (get_mic_shutdown_status(mic, shutdown_status)) { - case MIC_RESTART: - mic->restart = 1; - case MIC_HALTED: - case MIC_POWER_OFF: - case MIC_CRASHED: - free(shutdown_status); - goto reset; - default: - break; - } - free(shutdown_status); - sleep(1); - i--; - } -reset: - if (!i) - mpsslog("%s: %s %d timing out waiting for shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - reset(mic); -} - -static int open_state_fd(struct mic_info *mic) -{ - char pathname[PATH_MAX]; - int fd; - - snprintf(pathname, PATH_MAX - 1, "%s/%s/%s", - MICSYSFSDIR, mic->name, "state"); - - fd = open(pathname, O_RDONLY); - if (fd < 0) - mpsslog("%s: opening file %s failed %s\n", - mic->name, pathname, strerror(errno)); - return fd; -} - -static int block_till_state_change(int fd, struct mic_info *mic) -{ - struct pollfd ufds[1]; - char value[PAGE_SIZE]; - int ret; - - ufds[0].fd = fd; - ufds[0].events = POLLERR | POLLPRI; - ret = poll(ufds, 1, -1); - if (ret < 0) { - mpsslog("%s: %s %d poll failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = lseek(fd, 0, SEEK_SET); - if (ret < 0) { - mpsslog("%s: %s %d Failed to seek to 0: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = read(fd, value, sizeof(value)); - if (ret < 0) { - mpsslog("%s: %s %d Failed to read sysfs entry: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - return 0; -} - -static void * -mic_config(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int fd, ret, stat = 0; - - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto exit; - } - - do { - ret = block_till_state_change(fd, mic); - if (ret < 0) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto close_exit; - } - - switch (get_mic_state(mic)) { - case MIC_SHUTTING_DOWN: - mic_handle_shutdown(mic); - break; - case MIC_READY: - case MIC_RESET_FAILED: - ret = kill(mic->pid, SIGTERM); - mpsslog("%s: %s %d kill pid %d ret %d\n", - mic->name, __func__, __LINE__, - mic->pid, ret); - if (!ret) { - ret = waitpid(mic->pid, &stat, - WIFSIGNALED(stat)); - mpsslog("%s: %s %d waitpid ret %d pid %d\n", - mic->name, __func__, __LINE__, - ret, mic->pid); - } - if (mic->boot_on_resume) { - setsysfs(mic->name, "state", "boot"); - mic->boot_on_resume = 0; - } - goto close_exit; - default: - break; - } - } while (1); - -close_exit: - close(fd); -exit: - init_mic(mic); - pthread_exit(NULL); -} - -static void -set_cmdline(struct mic_info *mic) -{ - char buffer[PATH_MAX]; - int len; - - len = snprintf(buffer, PATH_MAX, - "clocksource=tsc highres=off nohz=off "); - len += snprintf(buffer + len, PATH_MAX - len, - "cpufreq_on;corec6_off;pc3_off;pc6_off "); - len += snprintf(buffer + len, PATH_MAX - len, - "ifcfg=static;address,172.31.%d.1;netmask,255.255.255.0", - mic->id + 1); - - setsysfs(mic->name, "cmdline", buffer); - mpsslog("%s: Command line: \"%s\"\n", mic->name, buffer); - snprintf(buffer, PATH_MAX, "172.31.%d.1", mic->id + 1); - mpsslog("%s: IPADDR: \"%s\"\n", mic->name, buffer); -} - -static void -set_log_buf_info(struct mic_info *mic) -{ - int fd; - off_t len; - char system_map[] = "/lib/firmware/mic/System.map"; - char *map, *temp, log_buf[17] = {'\0'}; - - fd = open(system_map, O_RDONLY); - if (fd < 0) { - mpsslog("%s: Opening System.map failed: %d\n", - mic->name, errno); - return; - } - len = lseek(fd, 0, SEEK_END); - if (len < 0) { - mpsslog("%s: Reading System.map size failed: %d\n", - mic->name, errno); - close(fd); - return; - } - map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0); - if (map == MAP_FAILED) { - mpsslog("%s: mmap of System.map failed: %d\n", - mic->name, errno); - close(fd); - return; - } - temp = strstr(map, "__log_buf"); - if (!temp) { - mpsslog("%s: __log_buf not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_addr", log_buf); - mpsslog("%s: log_buf_addr: %s\n", mic->name, log_buf); - temp = strstr(map, "log_buf_len"); - if (!temp) { - mpsslog("%s: log_buf_len not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_len", log_buf); - mpsslog("%s: log_buf_len: %s\n", mic->name, log_buf); - munmap(map, len); - close(fd); -} - -static void -change_virtblk_backend(int x, siginfo_t *siginfo, void *p) -{ - struct mic_info *mic; - - for (mic = mic_list.next; mic != NULL; mic = mic->next) - mic->mic_virtblk.signaled = 1/* true */; -} - -static void -set_mic_boot_params(struct mic_info *mic) -{ - set_log_buf_info(mic); - set_cmdline(mic); -} - -static void * -init_mic(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - struct sigaction ignore = { - .sa_flags = 0, - .sa_handler = SIG_IGN - }; - struct sigaction act = { - .sa_flags = SA_SIGINFO, - .sa_sigaction = change_virtblk_backend, - }; - char buffer[PATH_MAX]; - int err, fd; - - /* - * Currently, one virtio block device is supported for each MIC card - * at a time. Any user (or test) can send a SIGUSR1 to the MIC daemon. - * The signal informs the virtio block backend about a change in the - * configuration file which specifies the virtio backend file name on - * the host. Virtio block backend then re-reads the configuration file - * and switches to the new block device. This signalling mechanism may - * not be required once multiple virtio block devices are supported by - * the MIC daemon. - */ - sigaction(SIGUSR1, &ignore, NULL); -retry: - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - goto retry; - } - - if (mic->restart) { - snprintf(buffer, PATH_MAX, "boot"); - setsysfs(mic->name, "state", buffer); - mpsslog("%s restarting mic %d\n", - mic->name, mic->restart); - mic->restart = 0; - } - - while (1) { - while (block_till_state_change(fd, mic)) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - continue; - } - - if (get_mic_state(mic) == MIC_BOOTING) - break; - } - - mic->pid = fork(); - switch (mic->pid) { - case 0: - add_virtio_device(mic, &virtcons_dev_page.dd); - add_virtio_device(mic, &virtnet_dev_page.dd); - err = pthread_create(&mic->mic_console.console_thread, NULL, - virtio_console, mic); - if (err) - mpsslog("%s virtcons pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_net.net_thread, NULL, - virtio_net, mic); - if (err) - mpsslog("%s virtnet pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_virtblk.block_thread, NULL, - virtio_block, mic); - if (err) - mpsslog("%s virtblk pthread_create failed %s\n", - mic->name, strerror(err)); - sigemptyset(&act.sa_mask); - err = sigaction(SIGUSR1, &act, NULL); - if (err) - mpsslog("%s sigaction SIGUSR1 failed %s\n", - mic->name, strerror(errno)); - while (1) - sleep(60); - case -1: - mpsslog("fork failed MIC name %s id %d errno %d\n", - mic->name, mic->id, errno); - break; - default: - err = pthread_create(&mic->config_thread, NULL, - mic_config, mic); - if (err) - mpsslog("%s mic_config pthread_create failed %s\n", - mic->name, strerror(err)); - } - - return NULL; -} - -static void -start_daemon(void) -{ - struct mic_info *mic; - int err; - - for (mic = mic_list.next; mic; mic = mic->next) { - set_mic_boot_params(mic); - err = pthread_create(&mic->init_thread, NULL, init_mic, mic); - if (err) - mpsslog("%s init_mic pthread_create failed %s\n", - mic->name, strerror(err)); - } - - while (1) - sleep(60); -} - -static int -init_mic_list(void) -{ - struct mic_info *mic = &mic_list; - struct dirent *file; - DIR *dp; - int cnt = 0; - - dp = opendir(MICSYSFSDIR); - if (!dp) - return 0; - - while ((file = readdir(dp)) != NULL) { - if (!strncmp(file->d_name, "mic", 3)) { - mic->next = calloc(1, sizeof(struct mic_info)); - if (mic->next) { - mic = mic->next; - mic->id = atoi(&file->d_name[3]); - mic->name = malloc(strlen(file->d_name) + 16); - if (mic->name) - strcpy(mic->name, file->d_name); - mpsslog("MIC name %s id %d\n", mic->name, - mic->id); - cnt++; - } - } - } - - closedir(dp); - return cnt; -} - -void -mpsslog(char *format, ...) -{ - va_list args; - char buffer[4096]; - char ts[52], *ts1; - time_t t; - - if (logfp == NULL) - return; - - va_start(args, format); - vsprintf(buffer, format, args); - va_end(args); - - time(&t); - ts1 = ctime_r(&t, ts); - ts1[strlen(ts1) - 1] = '\0'; - fprintf(logfp, "%s: %s", ts1, buffer); - - fflush(logfp); -} - -int -main(int argc, char *argv[]) -{ - int cnt; - pid_t pid; - - myname = argv[0]; - - logfp = fopen(LOGFILE_NAME, "a+"); - if (!logfp) { - fprintf(stderr, "cannot open logfile '%s'\n", LOGFILE_NAME); - exit(1); - } - pid = fork(); - switch (pid) { - case 0: - break; - case -1: - exit(2); - default: - exit(0); - } - - mpsslog("MIC Daemon start\n"); - - cnt = init_mic_list(); - if (cnt == 0) { - mpsslog("MIC module not loaded\n"); - exit(3); - } - mpsslog("MIC found %d devices\n", cnt); - - start_daemon(); - - exit(0); -} diff --git a/samples/mic/mpssd/mpssd.h b/samples/mic/mpssd/mpssd.h deleted file mode 100644 index 5f98bdafe653..000000000000 --- a/samples/mic/mpssd/mpssd.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ -#ifndef _MPSSD_H_ -#define _MPSSD_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MICSYSFSDIR "/sys/class/mic" -#define LOGFILE_NAME "/var/log/mpssd" -#define PAGE_SIZE 4096 - -struct mic_console_info { - pthread_t console_thread; - int virtio_console_fd; - void *console_dp; -}; - -struct mic_net_info { - pthread_t net_thread; - int virtio_net_fd; - int tap_fd; - void *net_dp; -}; - -struct mic_virtblk_info { - pthread_t block_thread; - int virtio_block_fd; - void *block_dp; - volatile sig_atomic_t signaled; - char *backend_file; - int backend; - void *backend_addr; - long backend_size; -}; - -struct mic_info { - int id; - char *name; - pthread_t config_thread; - pthread_t init_thread; - pid_t pid; - struct mic_console_info mic_console; - struct mic_net_info mic_net; - struct mic_virtblk_info mic_virtblk; - int restart; - int boot_on_resume; - struct mic_info *next; -}; - -__attribute__((format(printf, 1, 2))) -void mpsslog(char *format, ...); -char *readsysfs(char *dir, char *entry); -int setsysfs(char *dir, char *entry, char *value); -#endif diff --git a/samples/mic/mpssd/sysfs.c b/samples/mic/mpssd/sysfs.c deleted file mode 100644 index 3fb08eb7ed9d..000000000000 --- a/samples/mic/mpssd/sysfs.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ - -#include "mpssd.h" - -#define PAGE_SIZE 4096 - -char * -readsysfs(char *dir, char *entry) -{ - char filename[PATH_MAX]; - char value[PAGE_SIZE]; - char *string = NULL; - int fd; - int len; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, - "%s/%s/%s", MICSYSFSDIR, dir, entry); - - fd = open(filename, O_RDONLY); - if (fd < 0) { - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - return NULL; - } - - len = read(fd, value, sizeof(value)); - if (len < 0) { - mpsslog("Failed to read sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto readsys_ret; - } - if (len == 0) - goto readsys_ret; - - value[len - 1] = '\0'; - - string = malloc(strlen(value) + 1); - if (string) - strcpy(string, value); - -readsys_ret: - close(fd); - return string; -} - -int -setsysfs(char *dir, char *entry, char *value) -{ - char filename[PATH_MAX]; - char *oldvalue; - int fd, ret = 0; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, "%s/%s/%s", - MICSYSFSDIR, dir, entry); - - oldvalue = readsysfs(dir, entry); - - fd = open(filename, O_RDWR); - if (fd < 0) { - ret = errno; - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto done; - } - - if (!oldvalue || strcmp(value, oldvalue)) { - if (write(fd, value, strlen(value)) < 0) { - ret = errno; - mpsslog("Failed to write new sysfs entry '%s': %s\n", - filename, strerror(errno)); - } - } - close(fd); -done: - if (oldvalue) - free(oldvalue); - return ret; -} -- cgit v1.2.3