diff options
Diffstat (limited to 'tools')
640 files changed, 31049 insertions, 4585 deletions
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 21cb3c3d1331..64796c0223be 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -24,6 +24,7 @@ #include <sys/socket.h> #include <sys/wait.h> #include <signal.h> +#include <time.h> #include <linux/genetlink.h> #include <linux/taskstats.h> @@ -195,6 +196,37 @@ static int get_family_id(int sd) #define delay_ms(t) (t / 1000000ULL) /* + * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS) + * Returns formatted string or "N/A" if timestamp is zero + */ +static const char *format_timespec64(struct timespec64 *ts) +{ + static char buffer[32]; + struct tm tm_info; + time_t time_sec; + + /* Check if timestamp is zero (not set) */ + if (ts->tv_sec == 0 && ts->tv_nsec == 0) + return "N/A"; + + time_sec = (time_t)ts->tv_sec; + + /* Use thread-safe localtime_r */ + if (localtime_r(&time_sec, &tm_info) == NULL) + return "N/A"; + + snprintf(buffer, sizeof(buffer), "%04d-%02d-%02dT%02d:%02d:%02d", + tm_info.tm_year + 1900, + tm_info.tm_mon + 1, + tm_info.tm_mday, + tm_info.tm_hour, + tm_info.tm_min, + tm_info.tm_sec); + + return buffer; +} + +/* * Version compatibility note: * Field availability depends on taskstats version (t->version), * corresponding to TASKSTATS_VERSION in kernel headers @@ -205,13 +237,28 @@ static int get_family_id(int sd) * version >= 13 - supports WPCOPY statistics * version >= 14 - supports IRQ statistics * version >= 16 - supports *_max and *_min delay statistics + * version >= 17 - supports delay max timestamp statistics * * Always verify version before accessing version-dependent fields * to maintain backward compatibility. */ #define PRINT_CPU_DELAY(version, t) \ do { \ - if (version >= 16) { \ + if (version >= 17) { \ + printf("%-10s%15s%15s%15s%15s%15s%15s%15s%25s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average", "delay max", \ + "delay min", "delay max timestamp"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ + delay_ms((double)(t)->cpu_delay_max), \ + delay_ms((double)(t)->cpu_delay_min), \ + format_timespec64(&(t)->cpu_delay_max_ts)); \ + } else if (version >= 16) { \ printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ "CPU", "count", "real total", "virtual total", \ "delay total", "delay average", "delay max", "delay min"); \ @@ -257,44 +304,115 @@ static int get_family_id(int sd) } \ } while (0) +#define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \ + do { \ + if (version >= 17) { \ + printf("%-10s%15s%15s%15s%15s%15s%25s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min", "delay max timestamp"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min), \ + format_timespec64(&(t)->max_ts)); \ + } else if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min)); \ + } else { \ + printf("%-10s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average"); \ + printf(" %15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count)); \ + } \ + } while (0) + static void print_delayacct(struct taskstats *t) { printf("\n\n"); PRINT_CPU_DELAY(t->version, t); - PRINT_FILED_DELAY("IO", t->version, t, - blkio_count, blkio_delay_total, - blkio_delay_max, blkio_delay_min); + /* Use new macro with timestamp support for version >= 17 */ + if (t->version >= 17) { + PRINT_FILED_DELAY_WITH_TS("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min, blkio_delay_max_ts); - PRINT_FILED_DELAY("SWAP", t->version, t, - swapin_count, swapin_delay_total, - swapin_delay_max, swapin_delay_min); + PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min, swapin_delay_max_ts); - PRINT_FILED_DELAY("RECLAIM", t->version, t, - freepages_count, freepages_delay_total, - freepages_delay_max, freepages_delay_min); + PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min, freepages_delay_max_ts); - PRINT_FILED_DELAY("THRASHING", t->version, t, - thrashing_count, thrashing_delay_total, - thrashing_delay_max, thrashing_delay_min); + PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts); - if (t->version >= 11) { - PRINT_FILED_DELAY("COMPACT", t->version, t, - compact_count, compact_delay_total, - compact_delay_max, compact_delay_min); - } + if (t->version >= 11) { + PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min, compact_delay_max_ts); + } - if (t->version >= 13) { - PRINT_FILED_DELAY("WPCOPY", t->version, t, - wpcopy_count, wpcopy_delay_total, - wpcopy_delay_max, wpcopy_delay_min); - } + if (t->version >= 13) { + PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts); + } - if (t->version >= 14) { - PRINT_FILED_DELAY("IRQ", t->version, t, - irq_count, irq_delay_total, - irq_delay_max, irq_delay_min); + if (t->version >= 14) { + PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min, irq_delay_max_ts); + } + } else { + /* Use original macro for older versions */ + PRINT_FILED_DELAY("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min); + + PRINT_FILED_DELAY("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min); + + PRINT_FILED_DELAY("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min); + + PRINT_FILED_DELAY("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min); + + if (t->version >= 11) { + PRINT_FILED_DELAY("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min); + } + + if (t->version >= 13) { + PRINT_FILED_DELAY("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min); + } + + if (t->version >= 14) { + PRINT_FILED_DELAY("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min); + } } } diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h index 3d265f6babaf..6791f6508632 100644 --- a/tools/arch/alpha/include/uapi/asm/errno.h +++ b/tools/arch/alpha/include/uapi/asm/errno.h @@ -55,6 +55,7 @@ #define ENOSR 82 /* Out of streams resources */ #define ETIME 83 /* Timer expired */ #define EBADMSG 84 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EPROTO 85 /* Protocol error */ #define ENODATA 86 /* No data available */ #define ENOSTR 87 /* Device not a stream */ @@ -96,6 +97,7 @@ #define EREMCHG 115 /* Remote address changed */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h index 2fb714e2d6d8..c01ed91b1ef4 100644 --- a/tools/arch/mips/include/uapi/asm/errno.h +++ b/tools/arch/mips/include/uapi/asm/errno.h @@ -50,6 +50,7 @@ #define EDOTDOT 73 /* RFS specific error */ #define EMULTIHOP 74 /* Multihop attempted */ #define EBADMSG 77 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define ENAMETOOLONG 78 /* File name too long */ #define EOVERFLOW 79 /* Value too large for defined data type */ #define ENOTUNIQ 80 /* Name not unique on network */ @@ -88,6 +89,7 @@ #define EISCONN 133 /* Transport endpoint is already connected */ #define ENOTCONN 134 /* Transport endpoint is not connected */ #define EUCLEAN 135 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 137 /* Not a XENIX named type file */ #define ENAVAIL 138 /* No XENIX semaphores available */ #define EISNAM 139 /* Is a named type file */ diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h index 8d94739d75c6..8cbc07c1903e 100644 --- a/tools/arch/parisc/include/uapi/asm/errno.h +++ b/tools/arch/parisc/include/uapi/asm/errno.h @@ -36,6 +36,7 @@ #define EDOTDOT 66 /* RFS specific error */ #define EBADMSG 67 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ #define ESTALE 70 /* Stale file handle */ @@ -62,6 +63,7 @@ #define ERESTART 175 /* Interrupted system call should be restarted */ #define ESTRPIPE 176 /* Streams pipe error */ #define EUCLEAN 177 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 178 /* Not a XENIX named type file */ #define ENAVAIL 179 /* No XENIX semaphores available */ #define EISNAM 180 /* Is a named type file */ diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h index 81a732b902ee..4a41e7835fd5 100644 --- a/tools/arch/sparc/include/uapi/asm/errno.h +++ b/tools/arch/sparc/include/uapi/asm/errno.h @@ -48,6 +48,7 @@ #define ENOSR 74 /* Out of streams resources */ #define ENOMSG 75 /* No message of desired type */ #define EBADMSG 76 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EIDRM 77 /* Identifier removed */ #define EDEADLK 78 /* Resource deadlock would occur */ #define ENOLCK 79 /* No record locks available */ @@ -91,6 +92,7 @@ #define ENOTUNIQ 115 /* Name not unique on network */ #define ERESTART 116 /* Interrupted syscall should be restarted */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 3d0a0950d20a..43adc38d31d5 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -794,8 +794,8 @@ #define MSR_F19H_UMC_PERF_CTR 0xc0010801 /* Zen 2 */ -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT 1 /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index a9ed8992800f..22da07087e42 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -24,7 +24,7 @@ NET COMMANDS ============ | **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** | **prepend** ] | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net help** | @@ -58,11 +58,9 @@ bpftool net { show | list } [ dev *NAME* ] then all bpf programs attached to non clsact qdiscs, and finally all bpf programs attached to root and clsact qdisc. -bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite | prepend ] Attach bpf program *PROG* to network interface *NAME* with type specified - by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the - command used with **overwrite** option. Currently, only XDP-related modes - are supported for *ATTACH_TYPE*. + by *ATTACH_TYPE*. *ATTACH_TYPE* can be of: **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; @@ -72,11 +70,18 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] **tcx_ingress** - Ingress TCX. runs on ingress net traffic; **tcx_egress** - Egress TCX. runs on egress net traffic; + For XDP-related attach types (**xdp**, **xdpgeneric**, **xdpdrv**, + **xdpoffload**), the **overwrite** option can be used to replace a + previously attached bpf program. + + For **tcx_ingress** and **tcx_egress** attach types, the **prepend** option + can be used to attach the program at the beginning of the chain instead of + at the end. + bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used - for attach must be specified. Currently, only XDP-related modes are - supported for *ATTACH_TYPE*. + for attach must be specified. bpftool net help Print short help message. @@ -192,6 +197,17 @@ EXAMPLES lo(1) tcx/ingress tc_prog prog_id 29 | +| **# bpftool net attach tcx_ingress name tc_prog2 dev lo prepend** +| **# bpftool net** +| + +:: + + tc: + lo(1) tcx/ingress tc_prog2 prog_id 30 + lo(1) tcx/ingress tc_prog prog_id 29 + +| | **# bpftool net attach tcx_ingress name tc_prog dev lo** | **# bpftool net detach tcx_ingress dev lo** | **# bpftool net** diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 5442073a2e42..519ea5cb8ab1 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -130,8 +130,8 @@ include $(FEATURES_DUMP) endif endif -LIBS = $(LIBBPF) -lelf -lz -lcrypto -LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto +LIBS = $(LIBBPF) -lelf -lcrypto -lz +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lcrypto -lz ifeq ($(feature-libelf-zstd),1) LIBS += -lzstd diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 53bcfeb1a76e..a28f0cc522e4 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1142,7 +1142,14 @@ _bpftool() return 0 ;; 8) - _bpftool_once_attr 'overwrite' + case ${words[3]} in + tcx_ingress|tcx_egress) + _bpftool_once_attr 'prepend' + ;; + *) + _bpftool_once_attr 'overwrite' + ;; + esac return 0 ;; esac diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index e8daf963ecef..8bfcff9e2f63 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -1191,6 +1191,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t) case BPF_TRACE_FENTRY: return "fentry"; case BPF_TRACE_FEXIT: return "fexit"; case BPF_MODIFY_RETURN: return "mod_ret"; + case BPF_TRACE_FSESSION: return "fsession"; case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select"; case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate"; default: return libbpf_bpf_attach_type_str(t); diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 993c7d9484a4..2f9e10752e28 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -731,10 +731,10 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h { \n\ struct %1$s *skel; \n\ \n\ - skel = skel_alloc(sizeof(*skel)); \n\ + skel = (struct %1$s *)skel_alloc(sizeof(*skel)); \n\ if (!skel) \n\ goto cleanup; \n\ - skel->ctx.sz = (void *)&skel->links - (void *)skel; \n\ + skel->ctx.sz = (char *)&skel->links - (char *)skel; \n\ ", obj_name, opts.data_sz); bpf_object__for_each_map(map, obj) { @@ -755,7 +755,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h \n\ \"; \n\ \n\ - skel->%1$s = skel_prep_map_data((void *)data, %2$zd,\n\ + skel->%1$s = (__typeof__(skel->%1$s))skel_prep_map_data((void *)data, %2$zd,\n\ sizeof(data) - 1);\n\ if (!skel->%1$s) \n\ goto cleanup; \n\ @@ -857,7 +857,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h codegen("\ \n\ - skel->%1$s = skel_finalize_map_data(&skel->maps.%1$s.initial_value, \n\ + skel->%1$s = (__typeof__(skel->%1$s))skel_finalize_map_data(&skel->maps.%1$s.initial_value,\n\ %2$zd, %3$s, skel->maps.%1$s.map_fd);\n\ if (!skel->%1$s) \n\ return -ENOMEM; \n\ diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index cfc6f944f7c3..f25d66c8395e 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -666,10 +666,16 @@ static int get_tcx_type(enum net_attach_type attach_type) } } -static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex) +static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex, bool prepend) { int type = get_tcx_type(attach_type); + if (prepend) { + LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = BPF_F_BEFORE + ); + return bpf_prog_attach_opts(progfd, ifindex, type, &opts); + } return bpf_prog_attach(progfd, ifindex, type, 0); } @@ -685,6 +691,7 @@ static int do_attach(int argc, char **argv) enum net_attach_type attach_type; int progfd, ifindex, err = 0; bool overwrite = false; + bool prepend = false; /* parse attach args */ if (!REQ_ARGS(5)) @@ -709,9 +716,25 @@ static int do_attach(int argc, char **argv) if (argc) { if (is_prefix(*argv, "overwrite")) { + if (attach_type != NET_ATTACH_TYPE_XDP && + attach_type != NET_ATTACH_TYPE_XDP_GENERIC && + attach_type != NET_ATTACH_TYPE_XDP_DRIVER && + attach_type != NET_ATTACH_TYPE_XDP_OFFLOAD) { + p_err("'overwrite' is only supported for xdp types"); + err = -EINVAL; + goto cleanup; + } overwrite = true; + } else if (is_prefix(*argv, "prepend")) { + if (attach_type != NET_ATTACH_TYPE_TCX_INGRESS && + attach_type != NET_ATTACH_TYPE_TCX_EGRESS) { + p_err("'prepend' is only supported for tcx_ingress/tcx_egress"); + err = -EINVAL; + goto cleanup; + } + prepend = true; } else { - p_err("expected 'overwrite', got: '%s'?", *argv); + p_err("expected 'overwrite' or 'prepend', got: '%s'?", *argv); err = -EINVAL; goto cleanup; } @@ -728,7 +751,7 @@ static int do_attach(int argc, char **argv) /* attach tcx prog */ case NET_ATTACH_TYPE_TCX_INGRESS: case NET_ATTACH_TYPE_TCX_EGRESS: - err = do_attach_tcx(progfd, attach_type, ifindex); + err = do_attach_tcx(progfd, attach_type, ifindex, prepend); break; default: break; @@ -985,7 +1008,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s { show | list } [dev <devname>]\n" - " %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n" + " %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite | prepend ]\n" " %1$s %2$s detach ATTACH_TYPE dev <devname>\n" " %1$s %2$s help\n" "\n" diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index ce1b556dfa90..1733a6e93a07 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -70,7 +70,8 @@ HOSTCFLAGS_resolve_btfids += -g \ -I$(srctree)/tools/include/uapi \ -I$(LIBBPF_INCLUDE) \ -I$(SUBCMD_INCLUDE) \ - $(LIBELF_FLAGS) + $(LIBELF_FLAGS) \ + -Wall -Werror LIBS = $(LIBELF_LIBS) -lz diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index d47191c6e55e..ca7fcd03efb6 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -71,9 +71,11 @@ #include <fcntl.h> #include <errno.h> #include <linux/btf_ids.h> +#include <linux/kallsyms.h> #include <linux/rbtree.h> #include <linux/zalloc.h> #include <linux/err.h> +#include <linux/limits.h> #include <bpf/btf.h> #include <bpf/libbpf.h> #include <subcmd/parse-options.h> @@ -98,6 +100,13 @@ # error "Unknown machine endianness!" #endif +enum btf_id_kind { + BTF_ID_KIND_NONE, + BTF_ID_KIND_SYM, + BTF_ID_KIND_SET, + BTF_ID_KIND_SET8 +}; + struct btf_id { struct rb_node rb_node; char *name; @@ -105,17 +114,20 @@ struct btf_id { int id; int cnt; }; + enum btf_id_kind kind; int addr_cnt; - bool is_set; - bool is_set8; Elf64_Addr addr[ADDR_CNT]; }; struct object { const char *path; - const char *btf; + const char *btf_path; const char *base_btf_path; + struct btf *btf; + struct btf *base_btf; + bool distill_base; + struct { int fd; Elf *elf; @@ -140,6 +152,25 @@ struct object { int nr_typedefs; }; +#define KF_IMPLICIT_ARGS (1 << 16) +#define KF_IMPL_SUFFIX "_impl" + +struct kfunc { + const char *name; + u32 btf_id; + u32 flags; +}; + +struct btf2btf_context { + struct btf *btf; + u32 *decl_tags; + u32 nr_decl_tags; + u32 max_decl_tags; + struct kfunc *kfuncs; + u32 nr_kfuncs; + u32 max_kfuncs; +}; + static int verbose; static int warnings; @@ -194,8 +225,10 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name) return NULL; } -static struct btf_id * -btf_id__add(struct rb_root *root, char *name, bool unique) +static struct btf_id *__btf_id__add(struct rb_root *root, + char *name, + enum btf_id_kind kind, + bool unique) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -218,12 +251,23 @@ btf_id__add(struct rb_root *root, char *name, bool unique) if (id) { pr_debug("adding symbol %s\n", name); id->name = name; + id->kind = kind; rb_link_node(&id->rb_node, parent, p); rb_insert_color(&id->rb_node, root); } return id; } +static inline struct btf_id *btf_id__add(struct rb_root *root, char *name, enum btf_id_kind kind) +{ + return __btf_id__add(root, name, kind, false); +} + +static inline struct btf_id *btf_id__add_unique(struct rb_root *root, char *name, enum btf_id_kind kind) +{ + return __btf_id__add(root, name, kind, true); +} + static char *get_id(const char *prefix_end) { /* @@ -257,22 +301,36 @@ static char *get_id(const char *prefix_end) return id; } -static struct btf_id *add_set(struct object *obj, char *name, bool is_set8) +static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind kind) { + int len = strlen(name); + int prefixlen; + char *id; + /* * __BTF_ID__set__name * name = ^ * id = ^ */ - char *id = name + (is_set8 ? sizeof(BTF_SET8 "__") : sizeof(BTF_SET "__")) - 1; - int len = strlen(name); + switch (kind) { + case BTF_ID_KIND_SET: + prefixlen = sizeof(BTF_SET "__") - 1; + break; + case BTF_ID_KIND_SET8: + prefixlen = sizeof(BTF_SET8 "__") - 1; + break; + default: + pr_err("Unexpected kind %d passed to %s() for symbol %s\n", kind, __func__, name); + return NULL; + } + id = name + prefixlen; if (id >= name + len) { pr_err("FAILED to parse set name: %s\n", name); return NULL; } - return btf_id__add(&obj->sets, id, true); + return btf_id__add_unique(&obj->sets, id, kind); } static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) @@ -285,45 +343,19 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return NULL; } - return btf_id__add(root, id, false); + return btf_id__add(root, id, BTF_ID_KIND_SYM); } -/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ -#ifndef SHF_COMPRESSED -#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */ -#endif - -/* - * The data of compressed section should be aligned to 4 - * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld - * sets sh_addralign to 1, which makes libelf fail with - * misaligned section error during the update: - * FAILED elf_update(WRITE): invalid section alignment - * - * While waiting for ld fix, we fix the compressed sections - * sh_addralign value manualy. - */ -static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh) +static void bswap_32_data(void *data, u32 nr_bytes) { - int expected = gelf_getclass(elf) == ELFCLASS32 ? 4 : 8; - - if (!(sh->sh_flags & SHF_COMPRESSED)) - return 0; - - if (sh->sh_addralign == expected) - return 0; - - pr_debug2(" - fixing wrong alignment sh_addralign %u, expected %u\n", - sh->sh_addralign, expected); + u32 cnt, i; + u32 *ptr; - sh->sh_addralign = expected; + cnt = nr_bytes / sizeof(u32); + ptr = data; - if (gelf_update_shdr(scn, sh) == 0) { - pr_err("FAILED cannot update section header: %s\n", - elf_errmsg(-1)); - return -1; - } - return 0; + for (i = 0; i < cnt; i++) + ptr[i] = bswap_32(ptr[i]); } static int elf_collect(struct object *obj) @@ -344,7 +376,7 @@ static int elf_collect(struct object *obj) elf_version(EV_CURRENT); - elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL); if (!elf) { close(fd); pr_err("FAILED cannot create ELF descriptor: %s\n", @@ -407,21 +439,20 @@ static int elf_collect(struct object *obj) obj->efile.symbols_shndx = idx; obj->efile.strtabidx = sh.sh_link; } else if (!strcmp(name, BTF_IDS_SECTION)) { + /* + * If target endianness differs from host, we need to bswap32 + * the .BTF_ids section data on load, because .BTF_ids has + * Elf_Type = ELF_T_BYTE, and so libelf returns data buffer in + * the target endianness. We repeat this on dump. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from target to host endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } obj->efile.idlist = data; obj->efile.idlist_shndx = idx; obj->efile.idlist_addr = sh.sh_addr; - } else if (!strcmp(name, BTF_BASE_ELF_SEC)) { - /* If a .BTF.base section is found, do not resolve - * BTF ids relative to vmlinux; resolve relative - * to the .BTF.base section instead. btf__parse_split() - * will take care of this once the base BTF it is - * passed is NULL. - */ - obj->base_btf_path = NULL; } - - if (compressed_section_fix(elf, scn, &sh)) - return -1; } return 0; @@ -488,35 +519,31 @@ static int symbols_collect(struct object *obj) id = add_symbol(&obj->funcs, prefix, sizeof(BTF_FUNC) - 1); /* set8 */ } else if (!strncmp(prefix, BTF_SET8, sizeof(BTF_SET8) - 1)) { - id = add_set(obj, prefix, true); + id = add_set(obj, prefix, BTF_ID_KIND_SET8); /* * SET8 objects store list's count, which is encoded * in symbol's size, together with 'cnt' field hence * that - 1. */ - if (id) { + if (id) id->cnt = sym.st_size / sizeof(uint64_t) - 1; - id->is_set8 = true; - } /* set */ } else if (!strncmp(prefix, BTF_SET, sizeof(BTF_SET) - 1)) { - id = add_set(obj, prefix, false); + id = add_set(obj, prefix, BTF_ID_KIND_SET); /* * SET objects store list's count, which is encoded * in symbol's size, together with 'cnt' field hence * that - 1. */ - if (id) { + if (id) id->cnt = sym.st_size / sizeof(int) - 1; - id->is_set = true; - } } else { pr_err("FAILED unsupported prefix %s\n", prefix); return -1; } if (!id) - return -ENOMEM; + return -EINVAL; if (id->addr_cnt >= ADDR_CNT) { pr_err("FAILED symbol %s crossed the number of allowed lists\n", @@ -529,16 +556,10 @@ static int symbols_collect(struct object *obj) return 0; } -static int symbols_resolve(struct object *obj) +static int load_btf(struct object *obj) { - int nr_typedefs = obj->nr_typedefs; - int nr_structs = obj->nr_structs; - int nr_unions = obj->nr_unions; - int nr_funcs = obj->nr_funcs; - struct btf *base_btf = NULL; - int err, type_id; - struct btf *btf; - __u32 nr_types; + struct btf *base_btf = NULL, *btf = NULL; + int err; if (obj->base_btf_path) { base_btf = btf__parse(obj->base_btf_path, NULL); @@ -546,18 +567,41 @@ static int symbols_resolve(struct object *obj) if (err) { pr_err("FAILED: load base BTF from %s: %s\n", obj->base_btf_path, strerror(-err)); - return -1; + goto out_err; } } - btf = btf__parse_split(obj->btf ?: obj->path, base_btf); + btf = btf__parse_split(obj->btf_path ?: obj->path, base_btf); err = libbpf_get_error(btf); if (err) { pr_err("FAILED: load BTF from %s: %s\n", - obj->btf ?: obj->path, strerror(-err)); - goto out; + obj->btf_path ?: obj->path, strerror(-err)); + goto out_err; } + obj->base_btf = base_btf; + obj->btf = btf; + + return 0; + +out_err: + btf__free(base_btf); + btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; + return err; +} + +static int symbols_resolve(struct object *obj) +{ + int nr_typedefs = obj->nr_typedefs; + int nr_structs = obj->nr_structs; + int nr_unions = obj->nr_unions; + int nr_funcs = obj->nr_funcs; + struct btf *btf = obj->btf; + int err, type_id; + __u32 nr_types; + err = -1; nr_types = btf__type_cnt(btf); @@ -615,8 +659,6 @@ static int symbols_resolve(struct object *obj) err = 0; out: - btf__free(base_btf); - btf__free(btf); return err; } @@ -627,7 +669,7 @@ static int id_patch(struct object *obj, struct btf_id *id) int i; /* For set, set8, id->id may be 0 */ - if (!id->id && !id->is_set && !id->is_set8) { + if (!id->id && id->kind != BTF_ID_KIND_SET && id->kind != BTF_ID_KIND_SET8) { pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name); warnings++; } @@ -680,6 +722,7 @@ static int sets_patch(struct object *obj) { Elf_Data *data = obj->efile.idlist; struct rb_node *next; + int cnt; next = rb_first(&obj->sets); while (next) { @@ -699,39 +742,28 @@ static int sets_patch(struct object *obj) return -1; } - if (id->is_set) { + switch (id->kind) { + case BTF_ID_KIND_SET: set = data->d_buf + off; + cnt = set->cnt; qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); - } else { + break; + case BTF_ID_KIND_SET8: set8 = data->d_buf + off; + cnt = set8->cnt; /* * Make sure id is at the beginning of the pairs * struct, otherwise the below qsort would not work. */ BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); - - /* - * When ELF endianness does not match endianness of the - * host, libelf will do the translation when updating - * the ELF. This, however, corrupts SET8 flags which are - * already in the target endianness. So, let's bswap - * them to the host endianness and libelf will then - * correctly translate everything. - */ - if (obj->efile.encoding != ELFDATANATIVE) { - int i; - - set8->flags = bswap_32(set8->flags); - for (i = 0; i < set8->cnt; i++) { - set8->pairs[i].flags = - bswap_32(set8->pairs[i].flags); - } - } + break; + default: + pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name); + return -1; } - pr_debug("sorting addr %5lu: cnt %6d [%s]\n", - off, id->is_set ? set->cnt : set8->cnt, id->name); + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", off, cnt, id->name); next = rb_next(next); } @@ -740,8 +772,6 @@ static int sets_patch(struct object *obj) static int symbols_patch(struct object *obj) { - off_t err; - if (__symbols_patch(obj, &obj->structs) || __symbols_patch(obj, &obj->unions) || __symbols_patch(obj, &obj->typedefs) || @@ -752,24 +782,665 @@ static int symbols_patch(struct object *obj) if (sets_patch(obj)) return -1; - /* Set type to ensure endian translation occurs. */ - obj->efile.idlist->d_type = ELF_T_WORD; + return 0; +} + +static int dump_raw_data(const char *out_path, const void *data, u32 size) +{ + size_t written; + FILE *file; + + file = fopen(out_path, "wb"); + if (!file) { + pr_err("Couldn't open %s for writing\n", out_path); + return -1; + } + + written = fwrite(data, 1, size, file); + if (written != size) { + pr_err("Failed to write data to %s\n", out_path); + fclose(file); + unlink(out_path); + return -1; + } + + fclose(file); + pr_debug("Dumped %lu bytes of data to %s\n", size, out_path); - elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY); + return 0; +} - err = elf_update(obj->efile.elf, ELF_C_WRITE); - if (err < 0) { - pr_err("FAILED elf_update(WRITE): %s\n", - elf_errmsg(-1)); +static int dump_raw_btf_ids(struct object *obj, const char *out_path) +{ + Elf_Data *data = obj->efile.idlist; + int err; + + if (!data || !data->d_buf) { + pr_debug("%s has no BTF_ids data to dump\n", obj->path); + return 0; + } + + /* + * If target endianness differs from host, we need to bswap32 the + * .BTF_ids section data before dumping so that the output is in + * target endianness. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from host to target endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } + + err = dump_raw_data(out_path, data->d_buf, data->d_size); + if (err) + return -1; + + return 0; +} + +static int dump_raw_btf(struct btf *btf, const char *out_path) +{ + const void *raw_btf_data; + u32 raw_btf_size; + int err; + + raw_btf_data = btf__raw_data(btf, &raw_btf_size); + if (!raw_btf_data) { + pr_err("btf__raw_data() failed\n"); + return -1; + } + + err = dump_raw_data(out_path, raw_btf_data, raw_btf_size); + if (err) + return -1; + + return 0; +} + +static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, s32 type_id) +{ + const struct btf_type *t = btf__type_by_id(btf, type_id); + + while (btf_is_mod(t)) + t = btf__type_by_id(btf, t->type); + + return t; +} + +static int push_decl_tag_id(struct btf2btf_context *ctx, u32 decl_tag_id) +{ + u32 *arr = ctx->decl_tags; + u32 cap = ctx->max_decl_tags; + + if (ctx->nr_decl_tags + 1 > cap) { + cap = max(cap + 256, cap * 2); + arr = realloc(arr, sizeof(u32) * cap); + if (!arr) + return -ENOMEM; + ctx->max_decl_tags = cap; + ctx->decl_tags = arr; + } + + ctx->decl_tags[ctx->nr_decl_tags++] = decl_tag_id; + + return 0; +} + +static int push_kfunc(struct btf2btf_context *ctx, struct kfunc *kfunc) +{ + struct kfunc *arr = ctx->kfuncs; + u32 cap = ctx->max_kfuncs; + + if (ctx->nr_kfuncs + 1 > cap) { + cap = max(cap + 256, cap * 2); + arr = realloc(arr, sizeof(struct kfunc) * cap); + if (!arr) + return -ENOMEM; + ctx->max_kfuncs = cap; + ctx->kfuncs = arr; + } + + ctx->kfuncs[ctx->nr_kfuncs++] = *kfunc; + + return 0; +} + +static int collect_decl_tags(struct btf2btf_context *ctx) +{ + const u32 type_cnt = btf__type_cnt(ctx->btf); + struct btf *btf = ctx->btf; + const struct btf_type *t; + int err; + + for (u32 id = 1; id < type_cnt; id++) { + t = btf__type_by_id(btf, id); + if (!btf_is_decl_tag(t)) + continue; + err = push_decl_tag_id(ctx, id); + if (err) + return err; + } + + return 0; +} + +/* + * To find the kfunc flags having its struct btf_id (with ELF addresses) + * we need to find the address that is in range of a set8. + * If a set8 is found, then the flags are located at addr + 4 bytes. + * Return 0 (no flags!) if not found. + */ +static u32 find_kfunc_flags(struct object *obj, struct btf_id *kfunc_id) +{ + const u32 *elf_data_ptr = obj->efile.idlist->d_buf; + u64 set_lower_addr, set_upper_addr, addr; + struct btf_id *set_id; + struct rb_node *next; + u32 flags; + u64 idx; + + for (next = rb_first(&obj->sets); next; next = rb_next(next)) { + set_id = rb_entry(next, struct btf_id, rb_node); + if (set_id->kind != BTF_ID_KIND_SET8 || set_id->addr_cnt != 1) + continue; + + set_lower_addr = set_id->addr[0]; + set_upper_addr = set_lower_addr + set_id->cnt * sizeof(u64); + + for (u32 i = 0; i < kfunc_id->addr_cnt; i++) { + addr = kfunc_id->addr[i]; + /* + * Lower bound is exclusive to skip the 8-byte header of the set. + * Upper bound is inclusive to capture the last entry at offset 8*cnt. + */ + if (set_lower_addr < addr && addr <= set_upper_addr) { + pr_debug("found kfunc %s in BTF_ID_FLAGS %s\n", + kfunc_id->name, set_id->name); + idx = addr - obj->efile.idlist_addr; + idx = idx / sizeof(u32) + 1; + flags = elf_data_ptr[idx]; + + return flags; + } + } + } + + return 0; +} + +static int collect_kfuncs(struct object *obj, struct btf2btf_context *ctx) +{ + const char *tag_name, *func_name; + struct btf *btf = ctx->btf; + const struct btf_type *t; + u32 flags, func_id; + struct kfunc kfunc; + struct btf_id *id; + int err; + + if (ctx->nr_decl_tags == 0) + return 0; + + for (u32 i = 0; i < ctx->nr_decl_tags; i++) { + t = btf__type_by_id(btf, ctx->decl_tags[i]); + if (btf_kflag(t) || btf_decl_tag(t)->component_idx != -1) + continue; + + tag_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(tag_name, "bpf_kfunc") != 0) + continue; + + func_id = t->type; + t = btf__type_by_id(btf, func_id); + if (!btf_is_func(t)) + continue; + + func_name = btf__name_by_offset(btf, t->name_off); + if (!func_name) + continue; + + id = btf_id__find(&obj->funcs, func_name); + if (!id || id->kind != BTF_ID_KIND_SYM) + continue; + + flags = find_kfunc_flags(obj, id); + + kfunc.name = id->name; + kfunc.btf_id = func_id; + kfunc.flags = flags; + + err = push_kfunc(ctx, &kfunc); + if (err) + return err; + } + + return 0; +} + +static int build_btf2btf_context(struct object *obj, struct btf2btf_context *ctx) +{ + int err; + + ctx->btf = obj->btf; + + err = collect_decl_tags(ctx); + if (err) { + pr_err("ERROR: resolve_btfids: failed to collect decl tags from BTF\n"); + return err; + } + + err = collect_kfuncs(obj, ctx); + if (err) { + pr_err("ERROR: resolve_btfids: failed to collect kfuncs from BTF\n"); + return err; + } + + return 0; +} + + +/* Implicit BPF kfunc arguments can only be of particular types */ +static bool is_kf_implicit_arg(const struct btf *btf, const struct btf_param *p) +{ + static const char *const kf_implicit_arg_types[] = { + "bpf_prog_aux", + }; + const struct btf_type *t; + const char *name; + + t = btf_type_skip_qualifiers(btf, p->type); + if (!btf_is_ptr(t)) + return false; + + t = btf_type_skip_qualifiers(btf, t->type); + if (!btf_is_struct(t)) + return false; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + return false; + + for (int i = 0; i < ARRAY_SIZE(kf_implicit_arg_types); i++) + if (strcmp(name, kf_implicit_arg_types[i]) == 0) + return true; + + return false; +} + +/* + * For a kfunc with KF_IMPLICIT_ARGS we do the following: + * 1. Add a new function with _impl suffix in the name, with the prototype + * of the original kfunc. + * 2. Add all decl tags except "bpf_kfunc" for the _impl func. + * 3. Add a new function prototype with modified list of arguments: + * omitting implicit args. + * 4. Change the prototype of the original kfunc to the new one. + * + * This way we transform the BTF associated with the kfunc from + * __bpf_kfunc bpf_foo(int arg1, void *implicit_arg); + * into + * bpf_foo_impl(int arg1, void *implicit_arg); + * __bpf_kfunc bpf_foo(int arg1); + * + * If a kfunc with KF_IMPLICIT_ARGS already has an _impl counterpart + * in BTF, then it's a legacy case: an _impl function is declared in the + * source code. In this case, we can skip adding an _impl function, but we + * still have to add a func prototype that omits implicit args. + */ +static int process_kfunc_with_implicit_args(struct btf2btf_context *ctx, struct kfunc *kfunc) +{ + s32 idx, new_proto_id, new_func_id, proto_id; + const char *param_name, *tag_name; + const struct btf_param *params; + enum btf_func_linkage linkage; + char tmp_name[KSYM_NAME_LEN]; + struct btf *btf = ctx->btf; + int err, len, nr_params; + struct btf_type *t; + + t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id); + if (!t || !btf_is_func(t)) { + pr_err("ERROR: resolve_btfids: btf id %d is not a function\n", kfunc->btf_id); + return -EINVAL; + } + + linkage = btf_vlen(t); + + proto_id = t->type; + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + if (!t || !btf_is_func_proto(t)) { + pr_err("ERROR: resolve_btfids: btf id %d is not a function prototype\n", proto_id); + return -EINVAL; + } + + len = snprintf(tmp_name, sizeof(tmp_name), "%s%s", kfunc->name, KF_IMPL_SUFFIX); + if (len < 0 || len >= sizeof(tmp_name)) { + pr_err("ERROR: function name is too long: %s%s\n", kfunc->name, KF_IMPL_SUFFIX); + return -E2BIG; + } + + if (btf__find_by_name_kind(btf, tmp_name, BTF_KIND_FUNC) > 0) { + pr_debug("resolve_btfids: function %s already exists in BTF\n", tmp_name); + goto add_new_proto; + } + + /* Add a new function with _impl suffix and original prototype */ + new_func_id = btf__add_func(btf, tmp_name, linkage, proto_id); + if (new_func_id < 0) { + pr_err("ERROR: resolve_btfids: failed to add func %s to BTF\n", tmp_name); + return new_func_id; + } + + /* Copy all decl tags except "bpf_kfunc" from the original kfunc to the new one */ + for (int i = 0; i < ctx->nr_decl_tags; i++) { + t = (struct btf_type *)btf__type_by_id(btf, ctx->decl_tags[i]); + if (t->type != kfunc->btf_id) + continue; + + tag_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(tag_name, "bpf_kfunc") == 0) + continue; + + idx = btf_decl_tag(t)->component_idx; + + if (btf_kflag(t)) + err = btf__add_decl_attr(btf, tag_name, new_func_id, idx); + else + err = btf__add_decl_tag(btf, tag_name, new_func_id, idx); + + if (err < 0) { + pr_err("ERROR: resolve_btfids: failed to add decl tag %s for %s\n", + tag_name, tmp_name); + return -EINVAL; + } + } + +add_new_proto: + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + new_proto_id = btf__add_func_proto(btf, t->type); + if (new_proto_id < 0) { + pr_err("ERROR: resolve_btfids: failed to add func proto for %s\n", kfunc->name); + return new_proto_id; + } + + /* Add non-implicit args to the new prototype */ + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + nr_params = btf_vlen(t); + for (int i = 0; i < nr_params; i++) { + params = btf_params(t); + if (is_kf_implicit_arg(btf, ¶ms[i])) + break; + param_name = btf__name_by_offset(btf, params[i].name_off); + err = btf__add_func_param(btf, param_name, params[i].type); + if (err < 0) { + pr_err("ERROR: resolve_btfids: failed to add param %s for %s\n", + param_name, kfunc->name); + return err; + } + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + } + + /* Finally change the prototype of the original kfunc to the new one */ + t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id); + t->type = new_proto_id; + + pr_debug("resolve_btfids: updated BTF for kfunc with implicit args %s\n", kfunc->name); + + return 0; +} + +static int btf2btf(struct object *obj) +{ + struct btf2btf_context ctx = {}; + int err; + + err = build_btf2btf_context(obj, &ctx); + if (err) + goto out; + + for (u32 i = 0; i < ctx.nr_kfuncs; i++) { + struct kfunc *kfunc = &ctx.kfuncs[i]; + + if (!(kfunc->flags & KF_IMPLICIT_ARGS)) + continue; + + err = process_kfunc_with_implicit_args(&ctx, kfunc); + if (err) + goto out; + } + + err = 0; +out: + free(ctx.decl_tags); + free(ctx.kfuncs); + + return err; +} + +/* + * Sort types by name in ascending order resulting in all + * anonymous types being placed before named types. + */ +static int cmp_type_names(const void *a, const void *b, void *priv) +{ + struct btf *btf = (struct btf *)priv; + const struct btf_type *ta = btf__type_by_id(btf, *(__u32 *)a); + const struct btf_type *tb = btf__type_by_id(btf, *(__u32 *)b); + const char *na, *nb; + int r; + + na = btf__str_by_offset(btf, ta->name_off); + nb = btf__str_by_offset(btf, tb->name_off); + r = strcmp(na, nb); + if (r != 0) + return r; + + /* preserve original relative order of anonymous or same-named types */ + return *(__u32 *)a < *(__u32 *)b ? -1 : 1; +} + +static int sort_btf_by_name(struct btf *btf) +{ + __u32 *permute_ids = NULL, *id_map = NULL; + int nr_types, i, err = 0; + __u32 start_id = 0, id; + + if (btf__base_btf(btf)) + start_id = btf__type_cnt(btf__base_btf(btf)); + nr_types = btf__type_cnt(btf) - start_id; + + permute_ids = calloc(nr_types, sizeof(*permute_ids)); + if (!permute_ids) { + err = -ENOMEM; + goto out; + } + + id_map = calloc(nr_types, sizeof(*id_map)); + if (!id_map) { + err = -ENOMEM; + goto out; + } + + for (i = 0, id = start_id; i < nr_types; i++, id++) + permute_ids[i] = id; + + qsort_r(permute_ids, nr_types, sizeof(*permute_ids), cmp_type_names, + btf); + + for (i = 0; i < nr_types; i++) { + id = permute_ids[i] - start_id; + id_map[id] = i + start_id; + } + + err = btf__permute(btf, id_map, nr_types, NULL); + if (err) + pr_err("FAILED: btf permute: %s\n", strerror(-err)); + +out: + free(permute_ids); + free(id_map); + return err; +} + +static int finalize_btf(struct object *obj) +{ + struct btf *base_btf = obj->base_btf, *btf = obj->btf; + int err; + + if (obj->base_btf && obj->distill_base) { + err = btf__distill_base(obj->btf, &base_btf, &btf); + if (err) { + pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); + goto out_err; + } + + btf__free(obj->base_btf); + btf__free(obj->btf); + obj->base_btf = base_btf; + obj->btf = btf; + } + + err = sort_btf_by_name(obj->btf); + if (err) { + pr_err("FAILED to sort BTF: %s\n", strerror(errno)); + goto out_err; } - pr_debug("update %s for %s\n", - err >= 0 ? "ok" : "failed", obj->path); - return err < 0 ? -1 : 0; + return 0; + +out_err: + btf__free(base_btf); + btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; + + return err; +} + +static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) +{ + int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); + + if (len < 0 || len >= buf_sz) { + pr_err("Output path is too long: %s%s\n", in_path, suffix); + return -E2BIG; + } + + return 0; +} + +/* + * Patch the .BTF_ids section of an ELF file with data from provided file. + * Equivalent to: objcopy --update-section .BTF_ids=<btfids> <elf> + * + * 1. Find .BTF_ids section in the ELF + * 2. Verify that blob file size matches section size + * 3. Update section data buffer with blob data + * 4. Write the ELF file + */ +static int patch_btfids(const char *btfids_path, const char *elf_path) +{ + Elf_Scn *scn = NULL; + FILE *btfids_file; + size_t shdrstrndx; + int fd, err = -1; + Elf_Data *data; + struct stat st; + GElf_Shdr sh; + char *name; + Elf *elf; + + elf_version(EV_CURRENT); + + fd = open(elf_path, O_RDWR, 0666); + if (fd < 0) { + pr_err("FAILED to open %s: %s\n", elf_path, strerror(errno)); + return -1; + } + + elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + if (!elf) { + close(fd); + pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1)); + return -1; + } + + elf_flagelf(elf, ELF_C_SET, ELF_F_LAYOUT); + + if (elf_getshdrstrndx(elf, &shdrstrndx) != 0) { + pr_err("FAILED cannot get shdr str ndx\n"); + goto out; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + + if (gelf_getshdr(scn, &sh) != &sh) { + pr_err("FAILED to get section header\n"); + goto out; + } + + name = elf_strptr(elf, shdrstrndx, sh.sh_name); + if (!name) + continue; + + if (strcmp(name, BTF_IDS_SECTION) == 0) + break; + } + + if (!scn) { + pr_err("FAILED: section %s not found in %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + data = elf_getdata(scn, NULL); + if (!data) { + pr_err("FAILED to get %s section data from %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + if (stat(btfids_path, &st) < 0) { + pr_err("FAILED to stat %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + if ((size_t)st.st_size != data->d_size) { + pr_err("FAILED: size mismatch - %s section in %s is %zu bytes, %s is %zu bytes\n", + BTF_IDS_SECTION, elf_path, data->d_size, btfids_path, (size_t)st.st_size); + goto out; + } + + btfids_file = fopen(btfids_path, "rb"); + if (!btfids_file) { + pr_err("FAILED to open %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + pr_debug("Copying data from %s to %s section of %s (%zu bytes)\n", + btfids_path, BTF_IDS_SECTION, elf_path, data->d_size); + + if (fread(data->d_buf, data->d_size, 1, btfids_file) != 1) { + pr_err("FAILED to read %s\n", btfids_path); + fclose(btfids_file); + goto out; + } + fclose(btfids_file); + + elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY); + if (elf_update(elf, ELF_C_WRITE) < 0) { + pr_err("FAILED to update ELF file %s\n", elf_path); + goto out; + } + + err = 0; +out: + elf_end(elf); + close(fd); + + return err; } static const char * const resolve_btfids_usage[] = { "resolve_btfids [<options>] <ELF object>", + "resolve_btfids --patch_btfids <.BTF_ids file> <ELF object>", NULL }; @@ -786,16 +1457,24 @@ int main(int argc, const char **argv) .funcs = RB_ROOT, .sets = RB_ROOT, }; + const char *btfids_path = NULL; bool fatal_warnings = false; + bool resolve_btfids = true; + char out_path[PATH_MAX]; + struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), - OPT_STRING(0, "btf", &obj.btf, "BTF data", - "BTF data"), + OPT_STRING(0, "btf", &obj.btf_path, "file", + "path to a file with input BTF data"), OPT_STRING('b', "btf_base", &obj.base_btf_path, "file", "path of file providing base BTF"), OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, "turn warnings into errors"), + OPT_BOOLEAN(0, "distill_base", &obj.distill_base, + "distill --btf_base and emit .BTF.base section data"), + OPT_STRING(0, "patch_btfids", &btfids_path, "file", + "path to .BTF_ids section data blob to patch into ELF file"), OPT_END() }; int err = -1; @@ -807,6 +1486,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (btfids_path) + return patch_btfids(btfids_path, obj.path); + if (elf_collect(&obj)) goto out; @@ -816,23 +1498,55 @@ int main(int argc, const char **argv) */ if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { - pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n"); - err = 0; - goto out; + pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n"); + resolve_btfids = false; } - if (symbols_collect(&obj)) + if (resolve_btfids) + if (symbols_collect(&obj)) + goto out; + + if (load_btf(&obj)) goto out; + if (btf2btf(&obj)) + goto out; + + if (finalize_btf(&obj)) + goto out; + + if (!resolve_btfids) + goto dump_btf; + if (symbols_resolve(&obj)) goto out; if (symbols_patch(&obj)) goto out; + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_IDS_SECTION); + err = err ?: dump_raw_btf_ids(&obj, out_path); + if (err) + goto out; + +dump_btf: + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_ELF_SEC); + err = err ?: dump_raw_btf(obj.btf, out_path); + if (err) + goto out; + + if (obj.base_btf && obj.distill_base) { + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_BASE_ELF_SEC); + err = err ?: dump_raw_btf(obj.base_btf, out_path); + if (err) + goto out; + } + if (!(fatal_warnings && warnings)) err = 0; out: + btf__free(obj.base_btf); + btf__free(obj.btf); if (obj.efile.elf) { elf_end(obj.efile.elf); close(obj.efile.fd); diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index e7da0909d097..e1571c04afb5 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -211,9 +211,25 @@ else addout "J" echo " * fwctl's mutating debug interface was used (#19)" fi +echo "Raw taint value as int/string: $taint/'$out'" + +# report on any tainted loadable modules +[ "$1" = "" ] && [ -r /sys/module/ ] && \ + cnt=`grep [A-Z] /sys/module/*/taint | wc -l` || cnt=0 +if [ $cnt -ne 0 ]; then + echo + echo "Tainted modules:" + for dir in `ls /sys/module` ; do + if [ -r /sys/module/$dir/taint ]; then + modtnt=`cat /sys/module/$dir/taint` + [ "$modtnt" = "" ] || echo " * $dir ($modtnt)" + fi + done +fi + +echo echo "For a more detailed explanation of the various taint flags see" echo " Documentation/admin-guide/tainted-kernels.rst in the Linux kernel sources" echo " or https://kernel.org/doc/html/latest/admin-guide/tainted-kernels.html" -echo "Raw taint value as int/string: $taint/'$out'" #EOF# diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh index 05552dbda5bc..53514c759dc1 100755 --- a/tools/docs/find-unused-docs.sh +++ b/tools/docs/find-unused-docs.sh @@ -28,7 +28,7 @@ if ! [ -d "$1" ]; then fi cd "$( dirname "${BASH_SOURCE[0]}" )" -cd .. +cd ../.. cd Documentation/ @@ -54,7 +54,7 @@ for file in `find $1 -name '*.c'`; do if [[ ${FILES_INCLUDED[$file]+_} ]]; then continue; fi - str=$(PYTHONDONTWRITEBYTECODE=1 scripts/kernel-doc -export "$file" 2>/dev/null) + str=$(PYTHONDONTWRITEBYTECODE=1 tools/docs/kernel-doc -export "$file" 2>/dev/null) if [[ -n "$str" ]]; then echo "$file" fi diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc new file mode 100755 index 000000000000..aed09f9a54dd --- /dev/null +++ b/tools/docs/kernel-doc @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# +# pylint: disable=C0103,R0912,R0914,R0915 +# +# NOTE: While kernel-doc requires at least version 3.6 to run, the +# command line should work with Python 3.2+ (tested with 3.4). +# The rationale is that it shall fail gracefully during Kernel +# compilation with older Kernel versions. Due to that: +# - encoding line is needed here; +# - f-strings cannot be used in this file. +# - libraries that require newer versions can only be included +# after the Python version has been checked. +# +# Converted from the kernel-doc script originally written in Perl +# under GPLv2, copyrighted since 1998 by the following authors: +# +# Aditya Srivastava <yashsri421@gmail.com> +# Akira Yokosawa <akiyks@gmail.com> +# Alexander A. Klimov <grandmaster@al2klimov.de> +# Alexander Lobakin <aleksander.lobakin@intel.com> +# André Almeida <andrealmeid@igalia.com> +# Andy Shevchenko <andriy.shevchenko@linux.intel.com> +# Anna-Maria Behnsen <anna-maria@linutronix.de> +# Armin Kuster <akuster@mvista.com> +# Bart Van Assche <bart.vanassche@sandisk.com> +# Ben Hutchings <ben@decadent.org.uk> +# Borislav Petkov <bbpetkov@yahoo.de> +# Chen-Yu Tsai <wenst@chromium.org> +# Coco Li <lixiaoyan@google.com> +# Conchúr Navid <conchur@web.de> +# Daniel Santos <daniel.santos@pobox.com> +# Danilo Cesar Lemes de Paula <danilo.cesar@collabora.co.uk> +# Dan Luedtke <mail@danrl.de> +# Donald Hunter <donald.hunter@gmail.com> +# Gabriel Krisman Bertazi <krisman@collabora.co.uk> +# Greg Kroah-Hartman <gregkh@linuxfoundation.org> +# Harvey Harrison <harvey.harrison@gmail.com> +# Horia Geanta <horia.geanta@freescale.com> +# Ilya Dryomov <idryomov@gmail.com> +# Jakub Kicinski <kuba@kernel.org> +# Jani Nikula <jani.nikula@intel.com> +# Jason Baron <jbaron@redhat.com> +# Jason Gunthorpe <jgg@nvidia.com> +# Jérémy Bobbio <lunar@debian.org> +# Johannes Berg <johannes.berg@intel.com> +# Johannes Weiner <hannes@cmpxchg.org> +# Jonathan Cameron <Jonathan.Cameron@huawei.com> +# Jonathan Corbet <corbet@lwn.net> +# Jonathan Neuschäfer <j.neuschaefer@gmx.net> +# Kamil Rytarowski <n54@gmx.com> +# Kees Cook <kees@kernel.org> +# Laurent Pinchart <laurent.pinchart@ideasonboard.com> +# Levin, Alexander (Sasha Levin) <alexander.levin@verizon.com> +# Linus Torvalds <torvalds@linux-foundation.org> +# Lucas De Marchi <lucas.demarchi@profusion.mobi> +# Mark Rutland <mark.rutland@arm.com> +# Markus Heiser <markus.heiser@darmarit.de> +# Martin Waitz <tali@admingilde.org> +# Masahiro Yamada <masahiroy@kernel.org> +# Matthew Wilcox <willy@infradead.org> +# Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# Michal Wajdeczko <michal.wajdeczko@intel.com> +# Michael Zucchi +# Mike Rapoport <rppt@linux.ibm.com> +# Niklas Söderlund <niklas.soderlund@corigine.com> +# Nishanth Menon <nm@ti.com> +# Paolo Bonzini <pbonzini@redhat.com> +# Pavan Kumar Linga <pavan.kumar.linga@intel.com> +# Pavel Pisa <pisa@cmp.felk.cvut.cz> +# Peter Maydell <peter.maydell@linaro.org> +# Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com> +# Randy Dunlap <rdunlap@infradead.org> +# Richard Kennedy <richard@rsk.demon.co.uk> +# Rich Walker <rw@shadow.org.uk> +# Rolf Eike Beer <eike-kernel@sf-tec.de> +# Sakari Ailus <sakari.ailus@linux.intel.com> +# Silvio Fricke <silvio.fricke@gmail.com> +# Simon Huggins +# Tim Waugh <twaugh@redhat.com> +# Tomasz Warniełło <tomasz.warniello@gmail.com> +# Utkarsh Tripathi <utripathi2002@gmail.com> +# valdis.kletnieks@vt.edu <valdis.kletnieks@vt.edu> +# Vegard Nossum <vegard.nossum@oracle.com> +# Will Deacon <will.deacon@arm.com> +# Yacine Belkadi <yacine.belkadi.1@gmail.com> +# Yujie Liu <yujie.liu@intel.com> + +""" +Print formatted kernel documentation to stdout. + +Read C language source or header FILEs, extract embedded +documentation comments, and print formatted documentation +to standard output. + +The documentation comments are identified by the ``/**`` +opening comment mark. + +See Documentation/doc-guide/kernel-doc.rst for the +documentation comment syntax. +""" + +import argparse +import logging +import os +import sys + +# Import Python modules + +LIB_DIR = "../lib/python" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +WERROR_RETURN_CODE = 3 + +DESC = """ +Read C language source or header FILEs, extract embedded documentation comments, +and print formatted documentation to standard output. + +The documentation comments are identified by the "/**" opening comment mark. + +See Documentation/doc-guide/kernel-doc.rst for the documentation comment syntax. +""" + +EXPORT_FILE_DESC = """ +Specify an additional FILE in which to look for EXPORT_SYMBOL information. + +May be used multiple times. +""" + +EXPORT_DESC = """ +Only output documentation for symbols that have been +exported using EXPORT_SYMBOL() and related macros in any input +FILE or -export-file FILE. +""" + +INTERNAL_DESC = """ +Only output documentation for symbols that have NOT been +exported using EXPORT_SYMBOL() and related macros in any input +FILE or -export-file FILE. +""" + +FUNCTION_DESC = """ +Only output documentation for the given function or DOC: section +title. All other functions and DOC: sections are ignored. + +May be used multiple times. +""" + +NOSYMBOL_DESC = """ +Exclude the specified symbol from the output documentation. + +May be used multiple times. +""" + +FILES_DESC = """ +Header and C source files to be parsed. +""" + +WARN_CONTENTS_BEFORE_SECTIONS_DESC = """ +Warn if there are contents before sections (deprecated). + +This option is kept just for backward-compatibility, but it does nothing, +neither here nor at the original Perl script. +""" + +EPILOG = """ +The return value is: + +- 0: success or Python version is not compatible with +kernel-doc. If -Werror is not used, it will also +return 0 if there are issues at kernel-doc markups; + +- 1: an abnormal condition happened; + +- 2: argparse issued an error; + +- 3: When -Werror is used, it means that one or more unfiltered parse + warnings happened. +""" + +class MsgFormatter(logging.Formatter): + """ + Helper class to capitalize errors and warnings, the same way + the venerable (now retired) kernel-doc.pl used to do. + """ + + def format(self, record): + record.levelname = record.levelname.capitalize() + return logging.Formatter.format(self, record) + +def main(): + """ + Main program. + + """ + + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, + description=DESC, epilog=EPILOG) + + # + # Normal arguments + # + parser.add_argument("-v", "-verbose", "--verbose", action="store_true", + help="Verbose output, more warnings and other information.") + + parser.add_argument("-d", "-debug", "--debug", action="store_true", + help="Enable debug messages") + + parser.add_argument("-M", "-modulename", "--modulename", + default="Kernel API", + help="Allow setting a module name at the output.") + + parser.add_argument("-l", "-enable-lineno", "--enable_lineno", + action="store_true", + help="Enable line number output (only in ReST mode)") + + # + # Arguments to control the warning behavior + # + parser.add_argument("-Wreturn", "--wreturn", action="store_true", + help="Warns about the lack of a return markup on functions.") + + parser.add_argument("-Wshort-desc", "-Wshort-description", "--wshort-desc", + action="store_true", + help="Warns if initial short description is missing") + + parser.add_argument("-Wcontents-before-sections", + "--wcontents-before-sections", action="store_true", + help=WARN_CONTENTS_BEFORE_SECTIONS_DESC) + + parser.add_argument("-Wall", "--wall", action="store_true", + help="Enable all types of warnings") + + parser.add_argument("-Werror", "--werror", action="store_true", + help="Treat warnings as errors.") + + parser.add_argument("-export-file", "--export-file", action='append', + help=EXPORT_FILE_DESC) + + # + # Output format mutually-exclusive group + # + out_group = parser.add_argument_group("Output format selection (mutually exclusive)") + + out_fmt = out_group.add_mutually_exclusive_group() + + out_fmt.add_argument("-m", "-man", "--man", action="store_true", + help="Output troff manual page format.") + out_fmt.add_argument("-r", "-rst", "--rst", action="store_true", + help="Output reStructuredText format (default).") + out_fmt.add_argument("-N", "-none", "--none", action="store_true", + help="Do not output documentation, only warnings.") + + # + # Output selection mutually-exclusive group + # + sel_group = parser.add_argument_group("Output selection (mutually exclusive)") + sel_mut = sel_group.add_mutually_exclusive_group() + + sel_mut.add_argument("-e", "-export", "--export", action='store_true', + help=EXPORT_DESC) + + sel_mut.add_argument("-i", "-internal", "--internal", action='store_true', + help=INTERNAL_DESC) + + sel_mut.add_argument("-s", "-function", "--symbol", action='append', + help=FUNCTION_DESC) + + # + # Those are valid for all 3 types of filter + # + parser.add_argument("-n", "-nosymbol", "--nosymbol", action='append', + help=NOSYMBOL_DESC) + + parser.add_argument("-D", "-no-doc-sections", "--no-doc-sections", + action='store_true', help="Don't output DOC sections") + + parser.add_argument("files", metavar="FILE", + nargs="+", help=FILES_DESC) + + args = parser.parse_args() + + if args.wall: + args.wreturn = True + args.wshort_desc = True + args.wcontents_before_sections = True + + logger = logging.getLogger() + + if not args.debug: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.DEBUG) + + formatter = MsgFormatter('%(levelname)s: %(message)s') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger.addHandler(handler) + + python_ver = sys.version_info[:2] + if python_ver < (3,6): + # + # Depending on the Kernel configuration, kernel-doc --none is called at + # build time. As we don't want to break compilation due to the + # usage of an old Python version, return 0 here. + # + if args.none: + logger.error("Python 3.6 or later is required by kernel-doc. Skipping checks") + sys.exit(0) + + sys.exit("Python 3.6 or later is required by kernel-doc. Aborting.") + + if python_ver < (3,7): + logger.warning("Python 3.7 or later is required for correct results") + + # + # Import kernel-doc libraries only after checking the Python version + # + from kdoc.kdoc_files import KernelFiles # pylint: disable=C0415 + from kdoc.kdoc_output import RestFormat, ManFormat # pylint: disable=C0415 + + if args.man: + out_style = ManFormat(modulename=args.modulename) + elif args.none: + out_style = None + else: + out_style = RestFormat() + + kfiles = KernelFiles(verbose=args.verbose, + out_style=out_style, werror=args.werror, + wreturn=args.wreturn, wshort_desc=args.wshort_desc, + wcontents_before_sections=args.wcontents_before_sections) + + kfiles.parse(args.files, export_file=args.export_file) + + for t in kfiles.msg(enable_lineno=args.enable_lineno, export=args.export, + internal=args.internal, symbol=args.symbol, + nosymbol=args.nosymbol, export_file=args.export_file, + no_doc_sections=args.no_doc_sections): + msg = t[1] + if msg: + print(msg) + + error_count = kfiles.errors + if not error_count: + sys.exit(0) + + if args.werror: + print("%s warnings as errors" % error_count) # pylint: disable=C0209 + sys.exit(WERROR_RETURN_CODE) + + if args.verbose: + print("%s errors" % error_count) # pylint: disable=C0209 + + sys.exit(0) + +# +# Call main method +# +if __name__ == "__main__": + main() diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper index 7a5fcef25429..b7c149dff06b 100755 --- a/tools/docs/sphinx-build-wrapper +++ b/tools/docs/sphinx-build-wrapper @@ -119,16 +119,17 @@ class SphinxBuilder: return path - def check_rust(self): + def check_rust(self, sphinxdirs): """ Checks if Rust is enabled """ - self.rustdoc = False - config = os.path.join(self.srctree, ".config") + if not {'.', 'rust'}.intersection(sphinxdirs): + return False + if not os.path.isfile(config): - return + return False re_rust = re.compile(r"CONFIG_RUST=(m|y)") @@ -136,11 +137,13 @@ class SphinxBuilder: with open(config, "r", encoding="utf-8") as fp: for line in fp: if re_rust.match(line): - self.rustdoc = True - return + return True except OSError as e: print(f"Failed to open {config}", file=sys.stderr) + return False + + return False def get_sphinx_extra_opts(self, n_jobs): """ @@ -165,6 +168,7 @@ class SphinxBuilder: parser = argparse.ArgumentParser() parser.add_argument('-j', '--jobs', type=int) parser.add_argument('-q', '--quiet', action='store_true') + parser.add_argument('-v', '--verbose', default=0, action='count') # # Other sphinx-build arguments go as-is, so place them @@ -176,10 +180,14 @@ class SphinxBuilder: # Build a list of sphinx args, honoring verbosity here if specified # - verbose = self.verbose sphinx_args, self.sphinxopts = parser.parse_known_args(sphinxopts) + + verbose = sphinx_args.verbose + if self.verbose: + verbose += 1 + if sphinx_args.quiet is True: - verbose = False + verbose = 0 # # If the user explicitly sets "-j" at command line, use it. @@ -192,8 +200,11 @@ class SphinxBuilder: else: self.n_jobs = None - if not verbose: + if verbose < 1: self.sphinxopts += ["-q"] + else: + for i in range(1, sphinx_args.verbose): + self.sphinxopts += ["-v"] def __init__(self, builddir, venv=None, verbose=False, n_jobs=None, interactive=None): @@ -246,7 +257,7 @@ class SphinxBuilder: # self.sphinxbuild = os.environ.get("SPHINXBUILD", "sphinx-build") self.kerneldoc = self.get_path(os.environ.get("KERNELDOC", - "scripts/kernel-doc.py")) + "tools/docs/kernel-doc")) self.builddir = self.get_path(builddir, use_cwd=True, abs_path=True) # @@ -259,8 +270,6 @@ class SphinxBuilder: self.get_sphinx_extra_opts(n_jobs) - self.check_rust() - # # If venv command line argument is specified, run Sphinx from venv # @@ -352,23 +361,6 @@ class SphinxBuilder: except (OSError, IOError) as e: print(f"Warning: Failed to copy CSS: {e}", file=sys.stderr) - if self.rustdoc: - print("Building rust docs") - if "MAKE" in self.env: - cmd = [self.env["MAKE"]] - else: - cmd = ["make", "LLVM=1"] - - cmd += [ "rustdoc"] - if self.verbose: - print(" ".join(cmd)) - - try: - subprocess.run(cmd, check=True) - except subprocess.CalledProcessError as e: - print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?", - file=sys.stderr) - def build_pdf_file(self, latex_cmd, from_dir, path): """Builds a single pdf file using latex_cmd""" try: @@ -689,6 +681,19 @@ class SphinxBuilder: if kerneldoc.startswith(self.srctree): kerneldoc = os.path.relpath(kerneldoc, self.srctree) + if not sphinxdirs: + sphinxdirs = os.environ.get("SPHINXDIRS", ".") + + # + # sphinxdirs can be a list or a whitespace-separated string + # + sphinxdirs_list = [] + for sphinxdir in sphinxdirs: + if isinstance(sphinxdir, list): + sphinxdirs_list += sphinxdir + else: + sphinxdirs_list += sphinxdir.split() + args = [ "-b", builder, "-c", docs_dir ] if builder == "latex": @@ -697,12 +702,10 @@ class SphinxBuilder: args.extend(["-D", f"latex_elements.papersize={paper}paper"]) - if self.rustdoc: + rustdoc = self.check_rust(sphinxdirs_list) + if rustdoc: args.extend(["-t", "rustdoc"]) - if not sphinxdirs: - sphinxdirs = os.environ.get("SPHINXDIRS", ".") - # # The sphinx-build tool has a bug: internally, it tries to set # locale with locale.setlocale(locale.LC_ALL, ''). This causes a @@ -714,16 +717,6 @@ class SphinxBuilder: self.env["LC_ALL"] = "C" # - # sphinxdirs can be a list or a whitespace-separated string - # - sphinxdirs_list = [] - for sphinxdir in sphinxdirs: - if isinstance(sphinxdir, list): - sphinxdirs_list += sphinxdir - else: - sphinxdirs_list += sphinxdir.split() - - # # Step 1: Build each directory in separate. # # This is not the best way of handling it, as cross-references between @@ -750,7 +743,6 @@ class SphinxBuilder: build_args = args + [ "-d", doctree_dir, - "-D", f"kerneldoc_bin={kerneldoc}", "-D", f"version={self.kernelversion}", "-D", f"release={self.kernelrelease}", "-D", f"kerneldoc_srctree={self.srctree}", @@ -786,6 +778,23 @@ class SphinxBuilder: elif target == "infodocs": self.handle_info(output_dirs) + if rustdoc and target in ["htmldocs", "epubdocs"]: + print("Building rust docs") + if "MAKE" in self.env: + cmd = [self.env["MAKE"]] + else: + cmd = ["make", "LLVM=1"] + + cmd += [ "rustdoc"] + if self.verbose: + print(" ".join(cmd)) + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?", + file=sys.stderr) + def jobs_type(value): """ Handle valid values for -j. Accepts Sphinx "-jauto", plus a number @@ -805,20 +814,42 @@ def jobs_type(value): except ValueError: raise argparse.ArgumentTypeError(f"Must be 'auto' or positive integer, got {value}") # pylint: disable=W0707 +EPILOG=""" +Besides the command line arguments, several environment variables affect its +default behavior, meant to be used when called via Kernel Makefile: + +- KERNELVERSION: Kernel major version +- KERNELRELEASE: Kernel release +- KBUILD_VERBOSE: Contains the value of "make V=[0|1] variable. + When V=0 (KBUILD_VERBOSE=0), sets verbose level to "-q". +- SPHINXBUILD: Documentation build tool (default: "sphinx-build"). +- SPHINXOPTS: Extra options pased to SPHINXBUILD + (default: "-j auto" and "-q" if KBUILD_VERBOSE=0). + The "-v" flag can be used to increase verbosity. + If V=0, the first "-v" will drop "-q". +- PYTHON3: Python command to run SPHINXBUILD +- PDFLATEX: LaTeX PDF engine. (default: "xelatex") +- LATEXOPTS: Optional set of command line arguments to the LaTeX engine +- srctree: Location of the Kernel root directory (default: "."). + +""" + def main(): """ Main function. The only mandatory argument is the target. If not specified, the other arguments will use default values if not specified at os.environ. """ - parser = argparse.ArgumentParser(description="Kernel documentation builder") + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, + description=__doc__, + epilog=EPILOG) parser.add_argument("target", choices=list(TARGETS.keys()), help="Documentation target to build") parser.add_argument("--sphinxdirs", nargs="+", help="Specific directories to build") parser.add_argument("--builddir", default="output", - help="Sphinx configuration file") + help="Sphinx configuration file (default: %(default)s)") parser.add_argument("--theme", help="Sphinx theme to use") @@ -834,7 +865,7 @@ def main(): help="place build in verbose mode") parser.add_argument('-j', '--jobs', type=jobs_type, - help="Sets number of jobs to use with sphinx-build") + help="Sets number of jobs to use with sphinx-build(default: auto)") parser.add_argument('-i', '--interactive', action='store_true', help="Change latex default to run in interactive mode") diff --git a/tools/include/linux/compiler-context-analysis.h b/tools/include/linux/compiler-context-analysis.h new file mode 100644 index 000000000000..13a9115e9e58 --- /dev/null +++ b/tools/include/linux/compiler-context-analysis.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H +#define _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H + +/* + * Macros and attributes for compiler-based static context analysis. + * No-op stubs for tools. + */ + +#define __guarded_by(...) +#define __pt_guarded_by(...) + +#define context_lock_struct(name, ...) struct __VA_ARGS__ name + +#define __no_context_analysis +#define __context_unsafe(comment) +#define context_unsafe(...) ({ __VA_ARGS__; }) +#define context_unsafe_alias(p) +#define disable_context_analysis() +#define enable_context_analysis() + +#define __must_hold(...) +#define __must_not_hold(...) +#define __acquires(...) +#define __cond_acquires(ret, x) +#define __releases(...) +#define __acquire(x) (void)0 +#define __release(x) (void)0 + +#define __must_hold_shared(...) +#define __acquires_shared(...) +#define __cond_acquires_shared(ret, x) +#define __releases_shared(...) +#define __acquire_shared(x) (void)0 +#define __release_shared(x) (void)0 + +#define __acquire_ret(call, expr) (call) +#define __acquire_shared_ret(call, expr) (call) +#define __acquires_ret +#define __acquires_shared_ret + +#endif /* _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H */ diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index d09f9dc172a4..b3adbf5682f0 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -13,23 +13,7 @@ #define __has_builtin(x) (0) #endif -#ifdef __CHECKER__ -/* context/locking */ -# define __must_hold(x) __attribute__((context(x,1,1))) -# define __acquires(x) __attribute__((context(x,0,1))) -# define __releases(x) __attribute__((context(x,1,0))) -# define __acquire(x) __context__(x,1) -# define __release(x) __context__(x,-1) -# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) -#else /* __CHECKER__ */ -/* context/locking */ -# define __must_hold(x) -# define __acquires(x) -# define __releases(x) -# define __acquire(x) (void)0 -# define __release(x) (void)0 -# define __cond_lock(x,c) (c) -#endif /* __CHECKER__ */ +#include <linux/compiler-context-analysis.h> /* Compiler specific macros. */ #ifdef __GNUC__ @@ -40,4 +24,26 @@ #define asm_goto_output(x...) asm goto(x) #endif +/* + * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged. + */ +/* + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' + * is not type-compatible with 'signed char', and we define a separate case. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type: (unsigned type)0, \ + signed type: (signed type)0 + +#define __unqual_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (x))) + #endif /* __LINUX_COMPILER_TYPES_H */ diff --git a/tools/include/linux/unaligned.h b/tools/include/linux/unaligned.h index 395a4464fe73..d51ddafed138 100644 --- a/tools/include/linux/unaligned.h +++ b/tools/include/linux/unaligned.h @@ -6,9 +6,6 @@ * This is the most generic implementation of unaligned accesses * and should work almost anywhere. */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpacked" -#pragma GCC diagnostic ignored "-Wattributes" #include <vdso/unaligned.h> #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) @@ -143,6 +140,5 @@ static inline u64 get_unaligned_be48(const void *p) { return __get_unaligned_be48(p); } -#pragma GCC diagnostic pop #endif /* __LINUX_UNALIGNED_H */ diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index 8118e22844f1..1958dda98895 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -54,6 +54,7 @@ all_files := \ sys/mman.h \ sys/mount.h \ sys/prctl.h \ + sys/ptrace.h \ sys/random.h \ sys/reboot.h \ sys/resource.h \ @@ -103,9 +104,12 @@ headers_standalone: headers $(Q)$(MAKE) -C $(srctree) headers $(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)sysroot +CFLAGS_s390 := -m64 +CFLAGS := $(CFLAGS_$(ARCH)) + headers_check: headers_standalone $(Q)for header in $(filter-out crt.h std.h,$(all_files)); do \ - $(CC) $(CLANG_CROSS_FLAGS) -Wall -Werror -nostdinc -fsyntax-only -x c /dev/null \ + $(CC) $(CFLAGS) $(CLANG_CROSS_FLAGS) -Wall -Werror -nostdinc -fsyntax-only -x c /dev/null \ -I$(or $(objtree),$(srctree))/usr/include -include $$header -include $$header || exit 1; \ done diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index 74125a254ce3..904281e95f99 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -5,6 +5,10 @@ #ifndef _NOLIBC_ARCH_S390_H #define _NOLIBC_ARCH_S390_H + +#include "types.h" + +#include <linux/sched.h> #include <linux/signal.h> #include <linux/unistd.h> @@ -186,4 +190,11 @@ pid_t sys_fork(void) } #define sys_fork sys_fork +static __attribute__((unused)) +pid_t sys_vfork(void) +{ + return my_syscall5(__NR_clone, 0, CLONE_VM | CLONE_VFORK | SIGCHLD, 0, 0, 0); +} +#define sys_vfork sys_vfork + #endif /* _NOLIBC_ARCH_S390_H */ diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h index 87090bbc53e0..a8c7619dcdde 100644 --- a/tools/include/nolibc/compiler.h +++ b/tools/include/nolibc/compiler.h @@ -47,4 +47,28 @@ # define __nolibc_fallthrough do { } while (0) #endif /* __nolibc_has_attribute(fallthrough) */ +#define __nolibc_version(_major, _minor, _patch) ((_major) * 10000 + (_minor) * 100 + (_patch)) + +#ifdef __GNUC__ +# define __nolibc_gnuc_version \ + __nolibc_version(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) +#else +# define __nolibc_gnuc_version 0 +#endif /* __GNUC__ */ + +#ifdef __clang__ +# define __nolibc_clang_version \ + __nolibc_version(__clang_major__, __clang_minor__, __clang_patchlevel__) +#else +# define __nolibc_clang_version 0 +#endif /* __clang__ */ + +#if __STDC_VERSION__ >= 201112L || \ + __nolibc_gnuc_version >= __nolibc_version(4, 6, 0) || \ + __nolibc_clang_version >= __nolibc_version(3, 0, 0) +# define __nolibc_static_assert(_t) _Static_assert(_t, "") +#else +# define __nolibc_static_assert(_t) +#endif + #endif /* _NOLIBC_COMPILER_H */ diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 272dfc961158..9c7f43b9218b 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -101,6 +101,7 @@ #include "sys/mman.h" #include "sys/mount.h" #include "sys/prctl.h" +#include "sys/ptrace.h" #include "sys/random.h" #include "sys/reboot.h" #include "sys/resource.h" diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h index 0d053f93ea99..e854c94647b1 100644 --- a/tools/include/nolibc/poll.h +++ b/tools/include/nolibc/poll.h @@ -23,24 +23,22 @@ static __attribute__((unused)) int sys_poll(struct pollfd *fds, int nfds, int timeout) { -#if defined(__NR_ppoll) - struct timespec t; +#if defined(__NR_ppoll_time64) + struct __kernel_timespec t; if (timeout >= 0) { t.tv_sec = timeout / 1000; t.tv_nsec = (timeout % 1000) * 1000000; } - return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); -#elif defined(__NR_ppoll_time64) - struct __kernel_timespec t; + return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); +#else + struct __kernel_old_timespec t; if (timeout >= 0) { t.tv_sec = timeout / 1000; t.tv_nsec = (timeout % 1000) * 1000000; } - return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); -#else - return my_syscall3(__NR_poll, fds, nfds, timeout); + return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0); #endif } diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h index 392f4dd94158..b9a116123902 100644 --- a/tools/include/nolibc/std.h +++ b/tools/include/nolibc/std.h @@ -29,6 +29,6 @@ typedef unsigned long nlink_t; typedef int64_t off_t; typedef signed long blksize_t; typedef signed long blkcnt_t; -typedef __kernel_time_t time_t; +typedef __kernel_time64_t time_t; #endif /* _NOLIBC_STD_H */ diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 1f16dab2ac88..233318b0d0f0 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -170,7 +170,7 @@ int putchar(int c) } -/* fwrite(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ +/* fwrite(), fread(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */ /* internal fwrite()-like function which only takes a size and returns 0 on * success or EOF on error. It automatically retries on short writes. @@ -204,6 +204,38 @@ size_t fwrite(const void *s, size_t size, size_t nmemb, FILE *stream) return written; } +/* internal fread()-like function which only takes a size and returns 0 on + * success or EOF on error. It automatically retries on short reads. + */ +static __attribute__((unused)) +int _fread(void *buf, size_t size, FILE *stream) +{ + int fd = fileno(stream); + ssize_t ret; + + while (size) { + ret = read(fd, buf, size); + if (ret <= 0) + return EOF; + size -= ret; + buf += ret; + } + return 0; +} + +static __attribute__((unused)) +size_t fread(void *s, size_t size, size_t nmemb, FILE *stream) +{ + size_t nread; + + for (nread = 0; nread < nmemb; nread++) { + if (_fread(s, size, stream) != 0) + break; + s += size; + } + return nread; +} + static __attribute__((unused)) int fputs(const char *s, FILE *stream) { @@ -240,6 +272,25 @@ char *fgets(char *s, int size, FILE *stream) } +/* fseek */ +static __attribute__((unused)) +int fseek(FILE *stream, long offset, int whence) +{ + int fd = fileno(stream); + off_t ret; + + ret = lseek(fd, offset, whence); + + /* lseek() and fseek() differ in that lseek returns the new + * position or -1, fseek() returns either 0 or -1. + */ + if (ret >= 0) + return 0; + + return -1; +} + + /* minimal printf(). It supports the following formats: * - %[l*]{d,u,c,x,p} * - %s diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 847af1ccbdc9..403ee9ce8389 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -22,7 +22,7 @@ #include <linux/time.h> #include <linux/auxvec.h> #include <linux/fcntl.h> /* for O_* and AT_* */ -#include <linux/sched.h> /* for clone_args */ +#include <linux/sched.h> /* for CLONE_* */ #include <linux/stat.h> /* for statx() */ #include "errno.h" @@ -363,19 +363,11 @@ pid_t fork(void) static __attribute__((unused)) pid_t sys_vfork(void) { -#if defined(__NR_vfork) +#if defined(__NR_clone) + /* See the note in sys_fork(). */ + return my_syscall5(__NR_clone, CLONE_VM | CLONE_VFORK | SIGCHLD, 0, 0, 0, 0); +#elif defined(__NR_vfork) return my_syscall0(__NR_vfork); -#else - /* - * clone() could be used but has different argument orders per - * architecture. - */ - struct clone_args args = { - .flags = CLONE_VM | CLONE_VFORK, - .exit_signal = SIGCHLD, - }; - - return my_syscall2(__NR_clone3, &args, sizeof(args)); #endif } #endif diff --git a/tools/include/nolibc/sys/ptrace.h b/tools/include/nolibc/sys/ptrace.h new file mode 100644 index 000000000000..72ca28541633 --- /dev/null +++ b/tools/include/nolibc/sys/ptrace.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * ptrace for NOLIBC + * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu> + * Copyright (C) 2025 Intel Corporation + */ + +/* make sure to include all global symbols */ +#include "../nolibc.h" + +#ifndef _NOLIBC_SYS_PTRACE_H +#define _NOLIBC_SYS_PTRACE_H + +#include "../sys.h" + +#include <linux/ptrace.h> + +/* + * long ptrace(int op, pid_t pid, void *addr, void *data); + */ +static __attribute__((unused)) +long sys_ptrace(int op, pid_t pid, void *addr, void *data) +{ + return my_syscall4(__NR_ptrace, op, pid, addr, data); +} + +static __attribute__((unused)) +ssize_t ptrace(int op, pid_t pid, void *addr, void *data) +{ + return __sysret(sys_ptrace(op, pid, addr, data)); +} + +#endif /* _NOLIBC_SYS_PTRACE_H */ diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h index 2a5619c01277..80cb3755ba18 100644 --- a/tools/include/nolibc/sys/select.h +++ b/tools/include/nolibc/sys/select.h @@ -63,33 +63,22 @@ typedef struct { static __attribute__((unused)) int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) { -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__NR__newselect) - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#elif defined(__NR_select) - return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout); -#elif defined(__NR_pselect6) - struct timespec t; +#if defined(__NR_pselect6_time64) + struct __kernel_timespec t; if (timeout) { t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; + t.tv_nsec = (uint32_t)timeout->tv_usec * 1000; } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); + return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); #else - struct __kernel_timespec t; + struct __kernel_old_timespec t; if (timeout) { t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; + t.tv_nsec = (uint32_t)timeout->tv_usec * 1000; } - return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); #endif } diff --git a/tools/include/nolibc/sys/time.h b/tools/include/nolibc/sys/time.h index 33782a19aae9..afdb7e326df1 100644 --- a/tools/include/nolibc/sys/time.h +++ b/tools/include/nolibc/sys/time.h @@ -22,9 +22,6 @@ static int sys_clock_gettime(clockid_t clockid, struct timespec *tp); static __attribute__((unused)) int sys_gettimeofday(struct timeval *tv, struct timezone *tz) { -#ifdef __NR_gettimeofday - return my_syscall2(__NR_gettimeofday, tv, tz); -#else (void) tz; /* Non-NULL tz is undefined behaviour */ struct timespec tp; @@ -33,11 +30,10 @@ int sys_gettimeofday(struct timeval *tv, struct timezone *tz) ret = sys_clock_gettime(CLOCK_REALTIME, &tp); if (!ret && tv) { tv->tv_sec = tp.tv_sec; - tv->tv_usec = tp.tv_nsec / 1000; + tv->tv_usec = (uint32_t)tp.tv_nsec / 1000; } return ret; -#endif } static __attribute__((unused)) diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h index 5dd61030c991..29fd92bd47d2 100644 --- a/tools/include/nolibc/sys/timerfd.h +++ b/tools/include/nolibc/sys/timerfd.h @@ -32,16 +32,12 @@ int timerfd_create(int clockid, int flags) static __attribute__((unused)) int sys_timerfd_gettime(int fd, struct itimerspec *curr_value) { -#if defined(__NR_timerfd_gettime) - return my_syscall2(__NR_timerfd_gettime, fd, curr_value); +#if defined(__NR_timerfd_gettime64) + __nolibc_assert_time64_type(curr_value->it_value.tv_sec); + return my_syscall2(__NR_timerfd_gettime64, fd, curr_value); #else - struct __kernel_itimerspec kcurr_value; - int ret; - - ret = my_syscall2(__NR_timerfd_gettime64, fd, &kcurr_value); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); - return ret; + __nolibc_assert_native_time64(); + return my_syscall2(__NR_timerfd_gettime, fd, curr_value); #endif } @@ -56,20 +52,12 @@ static __attribute__((unused)) int sys_timerfd_settime(int fd, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { -#if defined(__NR_timerfd_settime) - return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); +#if defined(__NR_timerfd_settime64) + __nolibc_assert_time64_type(new_value->it_value.tv_sec); + return my_syscall4(__NR_timerfd_settime64, fd, flags, new_value, old_value); #else - struct __kernel_itimerspec knew_value, kold_value; - int ret; - - __nolibc_timespec_user_to_kernel(&new_value->it_value, &knew_value.it_value); - __nolibc_timespec_user_to_kernel(&new_value->it_interval, &knew_value.it_interval); - ret = my_syscall4(__NR_timerfd_settime64, fd, flags, &knew_value, &kold_value); - if (old_value) { - __nolibc_timespec_kernel_to_user(&kold_value.it_interval, &old_value->it_interval); - __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); - } - return ret; + __nolibc_assert_native_time64(); + return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h index 48e78f8becf9..f9257d6a7878 100644 --- a/tools/include/nolibc/time.h +++ b/tools/include/nolibc/time.h @@ -18,19 +18,11 @@ #include <linux/signal.h> #include <linux/time.h> -static __inline__ -void __nolibc_timespec_user_to_kernel(const struct timespec *ts, struct __kernel_timespec *kts) -{ - kts->tv_sec = ts->tv_sec; - kts->tv_nsec = ts->tv_nsec; -} +#define __nolibc_assert_time64_type(t) \ + __nolibc_static_assert(sizeof(t) == 8) -static __inline__ -void __nolibc_timespec_kernel_to_user(const struct __kernel_timespec *kts, struct timespec *ts) -{ - ts->tv_sec = kts->tv_sec; - ts->tv_nsec = kts->tv_nsec; -} +#define __nolibc_assert_native_time64() \ + __nolibc_assert_time64_type(__kernel_old_time_t) /* * int clock_getres(clockid_t clockid, struct timespec *res); @@ -43,16 +35,12 @@ void __nolibc_timespec_kernel_to_user(const struct __kernel_timespec *kts, struc static __attribute__((unused)) int sys_clock_getres(clockid_t clockid, struct timespec *res) { -#if defined(__NR_clock_getres) - return my_syscall2(__NR_clock_getres, clockid, res); +#if defined(__NR_clock_getres_time64) + __nolibc_assert_time64_type(res->tv_sec); + return my_syscall2(__NR_clock_getres_time64, clockid, res); #else - struct __kernel_timespec kres; - int ret; - - ret = my_syscall2(__NR_clock_getres_time64, clockid, &kres); - if (res) - __nolibc_timespec_kernel_to_user(&kres, res); - return ret; + __nolibc_assert_native_time64(); + return my_syscall2(__NR_clock_getres, clockid, res); #endif } @@ -65,16 +53,12 @@ int clock_getres(clockid_t clockid, struct timespec *res) static __attribute__((unused)) int sys_clock_gettime(clockid_t clockid, struct timespec *tp) { -#if defined(__NR_clock_gettime) - return my_syscall2(__NR_clock_gettime, clockid, tp); +#if defined(__NR_clock_gettime64) + __nolibc_assert_time64_type(tp->tv_sec); + return my_syscall2(__NR_clock_gettime64, clockid, tp); #else - struct __kernel_timespec ktp; - int ret; - - ret = my_syscall2(__NR_clock_gettime64, clockid, &ktp); - if (tp) - __nolibc_timespec_kernel_to_user(&ktp, tp); - return ret; + __nolibc_assert_native_time64(); + return my_syscall2(__NR_clock_gettime, clockid, tp); #endif } @@ -87,13 +71,12 @@ int clock_gettime(clockid_t clockid, struct timespec *tp) static __attribute__((unused)) int sys_clock_settime(clockid_t clockid, struct timespec *tp) { -#if defined(__NR_clock_settime) - return my_syscall2(__NR_clock_settime, clockid, tp); +#if defined(__NR_clock_settime64) + __nolibc_assert_time64_type(tp->tv_sec); + return my_syscall2(__NR_clock_settime64, clockid, tp); #else - struct __kernel_timespec ktp; - - __nolibc_timespec_user_to_kernel(tp, &ktp); - return my_syscall2(__NR_clock_settime64, clockid, &ktp); + __nolibc_assert_native_time64(); + return my_syscall2(__NR_clock_settime, clockid, tp); #endif } @@ -107,17 +90,12 @@ static __attribute__((unused)) int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqtp, struct timespec *rmtp) { -#if defined(__NR_clock_nanosleep) - return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); +#if defined(__NR_clock_nanosleep_time64) + __nolibc_assert_time64_type(rqtp->tv_sec); + return my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, rqtp, rmtp); #else - struct __kernel_timespec krqtp, krmtp; - int ret; - - __nolibc_timespec_user_to_kernel(rqtp, &krqtp); - ret = my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, &krqtp, &krmtp); - if (rmtp) - __nolibc_timespec_kernel_to_user(&krmtp, rmtp); - return ret; + __nolibc_assert_native_time64(); + return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp); #endif } @@ -189,16 +167,12 @@ int timer_delete(timer_t timerid) static __attribute__((unused)) int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value) { -#if defined(__NR_timer_gettime) - return my_syscall2(__NR_timer_gettime, timerid, curr_value); +#if defined(__NR_timer_gettime64) + __nolibc_assert_time64_type(curr_value->it_value.tv_sec); + return my_syscall2(__NR_timer_gettime64, timerid, curr_value); #else - struct __kernel_itimerspec kcurr_value; - int ret; - - ret = my_syscall2(__NR_timer_gettime64, timerid, &kcurr_value); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval); - __nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value); - return ret; + __nolibc_assert_native_time64(); + return my_syscall2(__NR_timer_gettime, timerid, curr_value); #endif } @@ -212,20 +186,12 @@ static __attribute__((unused)) int sys_timer_settime(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { -#if defined(__NR_timer_settime) - return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); +#if defined(__NR_timer_settime64) + __nolibc_assert_time64_type(new_value->it_value.tv_sec); + return my_syscall4(__NR_timer_settime64, timerid, flags, new_value, old_value); #else - struct __kernel_itimerspec knew_value, kold_value; - int ret; - - __nolibc_timespec_user_to_kernel(&new_value->it_value, &knew_value.it_value); - __nolibc_timespec_user_to_kernel(&new_value->it_interval, &knew_value.it_interval); - ret = my_syscall4(__NR_timer_settime64, timerid, flags, &knew_value, &kold_value); - if (old_value) { - __nolibc_timespec_kernel_to_user(&kold_value.it_interval, &old_value->it_interval); - __nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value); - } - return ret; + __nolibc_assert_native_time64(); + return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value); #endif } diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h index 470a5f77bc0f..8f3cb18df7f1 100644 --- a/tools/include/nolibc/types.h +++ b/tools/include/nolibc/types.h @@ -13,9 +13,24 @@ #include "std.h" #include <linux/mman.h> #include <linux/stat.h> -#include <linux/time.h> +#include <linux/time_types.h> #include <linux/wait.h> +struct timespec { + time_t tv_sec; + int64_t tv_nsec; +}; +#define _STRUCT_TIMESPEC + +/* Never use with system calls */ +struct timeval { + time_t tv_sec; + int64_t tv_usec; +}; + +#define timeval __nolibc_kernel_timeval +#include <linux/time.h> +#undef timeval /* Only the generic macros and types may be defined here. The arch-specific * ones such as the O_RDONLY and related macros used by fcntl() and open() diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h index cf9c51ac49f9..92e7ae493ee3 100644 --- a/tools/include/uapi/asm-generic/errno.h +++ b/tools/include/uapi/asm-generic/errno.h @@ -55,6 +55,7 @@ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ @@ -98,6 +99,7 @@ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index be7d8e060e10..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { @@ -918,6 +926,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +992,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1134,6 +1153,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; @@ -1373,6 +1393,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ @@ -1894,6 +1916,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 72f03153dd32..76e9d0664d0c 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1330,14 +1330,16 @@ union perf_mem_data_src { mem_snoopx : 2, /* Snoop mode, ext */ mem_blk : 3, /* Access blocked */ mem_hops : 3, /* Hop level */ - mem_rsvd : 18; + mem_region : 5, /* cache/memory regions */ + mem_rsvd : 13; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd : 18, + __u64 mem_rsvd : 13, + mem_region : 5, /* cache/memory regions */ mem_hops : 3, /* Hop level */ mem_blk : 3, /* Access blocked */ mem_snoopx : 2, /* Snoop mode, ext */ @@ -1394,7 +1396,7 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ -/* 0x007 available */ +#define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ @@ -1447,6 +1449,25 @@ union perf_mem_data_src { /* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 +/* Cache/Memory region */ +#define PERF_MEM_REGION_NA 0x0 /* Invalid */ +#define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ +#define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ +#define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ +#define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ +#define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ +#define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ +#define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ +#define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ +#define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ +#define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ +#define PERF_MEM_REGION_MEM3 0x0b /* Memory region 3 */ +#define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ +#define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ +#define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ +#define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ +#define PERF_MEM_REGION_SHIFT 46 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) diff --git a/tools/include/vdso/unaligned.h b/tools/include/vdso/unaligned.h index ff0c06b6513e..9076483c9fbb 100644 --- a/tools/include/vdso/unaligned.h +++ b/tools/include/vdso/unaligned.h @@ -2,14 +2,43 @@ #ifndef __VDSO_UNALIGNED_H #define __VDSO_UNALIGNED_H -#define __get_unaligned_t(type, ptr) ({ \ - const struct { type x; } __packed * __get_pptr = (typeof(__get_pptr))(ptr); \ - __get_pptr->x; \ +#include <linux/compiler_types.h> + +/** + * __get_unaligned_t - read an unaligned value from memory. + * @type: the type to load from the pointer. + * @ptr: the pointer to load from. + * + * Use memcpy to affect an unaligned type sized load avoiding undefined behavior + * from approaches like type punning that require -fno-strict-aliasing in order + * to be correct. As type may be const, use __unqual_scalar_typeof to map to a + * non-const type - you can't memcpy into a const type. The + * __get_unaligned_ctrl_type gives __unqual_scalar_typeof its required + * expression rather than type, a pointer is used to avoid warnings about mixing + * the use of 0 and NULL. The void* cast silences ubsan warnings. + */ +#define __get_unaligned_t(type, ptr) ({ \ + type *__get_unaligned_ctrl_type __always_unused = NULL; \ + __unqual_scalar_typeof(*__get_unaligned_ctrl_type) __get_unaligned_val; \ + __builtin_memcpy(&__get_unaligned_val, (void *)(ptr), \ + sizeof(__get_unaligned_val)); \ + __get_unaligned_val; \ }) -#define __put_unaligned_t(type, val, ptr) do { \ - struct { type x; } __packed * __put_pptr = (typeof(__put_pptr))(ptr); \ - __put_pptr->x = (val); \ +/** + * __put_unaligned_t - write an unaligned value to memory. + * @type: the type of the value to store. + * @val: the value to store. + * @ptr: the pointer to store to. + * + * Use memcpy to affect an unaligned type sized store avoiding undefined + * behavior from approaches like type punning that require -fno-strict-aliasing + * in order to be correct. The void* cast silences ubsan warnings. + */ +#define __put_unaligned_t(type, val, ptr) do { \ + type __put_unaligned_val = (val); \ + __builtin_memcpy((void *)(ptr), &__put_unaligned_val, \ + sizeof(__put_unaligned_val)); \ } while (0) #endif /* __VDSO_UNALIGNED_H */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index b66f5fbfbbb2..5846de364209 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -794,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: case BPF_LSM_MAC: attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); if (!OPTS_ZEROED(opts, tracing)) @@ -1397,3 +1398,22 @@ int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len, err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz); return libbpf_err_errno(err); } + +int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd, + struct bpf_prog_assoc_struct_ops_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops); + union bpf_attr attr; + int err; + + if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_assoc_struct_ops.map_fd = map_fd; + attr.prog_assoc_struct_ops.prog_fd = prog_fd; + attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0); + + err = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz); + return libbpf_err_errno(err); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index e983a3e40d61..2c8e88ddb674 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -289,6 +289,14 @@ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, * Update spin_lock-ed map elements. This must be * specified if the map value contains a spinlock. * + * **BPF_F_CPU** + * As for percpu maps, update value on the specified CPU. And the cpu + * info is embedded into the high 32 bits of **opts->elem_flags**. + * + * **BPF_F_ALL_CPUS** + * As for percpu maps, update value across all CPUs. This flag cannot + * be used with BPF_F_CPU at the same time. + * * @param fd BPF map file descriptor * @param keys pointer to an array of *count* keys * @param values pointer to an array of *count* values @@ -733,6 +741,27 @@ struct bpf_prog_stream_read_opts { LIBBPF_API int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len, struct bpf_prog_stream_read_opts *opts); +struct bpf_prog_assoc_struct_ops_opts { + size_t sz; + __u32 flags; + size_t :0; +}; +#define bpf_prog_assoc_struct_ops_opts__last_field flags + +/** + * @brief **bpf_prog_assoc_struct_ops** associates a BPF program with a + * struct_ops map. + * + * @param prog_fd FD for the BPF program + * @param map_fd FD for the struct_ops map to be associated with the BPF program + * @param opts optional options, can be NULL + * + * @return 0 on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd, + struct bpf_prog_assoc_struct_ops_opts *opts); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index d4e4e388e625..c145da05a67c 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; +extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz) __weak __ksym; #define bpf_stream_printk(stream_id, fmt, args...) \ ({ \ @@ -328,7 +328,7 @@ extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const vo ___bpf_fill(___param, args); \ _Pragma("GCC diagnostic pop") \ \ - bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ + bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 84a4b0abc8be..83fe79ffcb8f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -92,6 +92,8 @@ struct btf { * - for split BTF counts number of types added on top of base BTF. */ __u32 nr_types; + /* the start IDs of named types in sorted BTF */ + int named_start_id; /* if not NULL, points to the base BTF on top of which the current * split BTF is based */ @@ -897,46 +899,105 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id) return type_id; } -__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +static void btf_check_sorted(struct btf *btf) { - __u32 i, nr_types = btf__type_cnt(btf); + __u32 i, n, named_start_id = 0; - if (!strcmp(type_name, "void")) - return 0; + n = btf__type_cnt(btf); + for (i = btf->start_id + 1; i < n; i++) { + struct btf_type *ta = btf_type_by_id(btf, i - 1); + struct btf_type *tb = btf_type_by_id(btf, i); + const char *na = btf__str_by_offset(btf, ta->name_off); + const char *nb = btf__str_by_offset(btf, tb->name_off); - for (i = 1; i < nr_types; i++) { - const struct btf_type *t = btf__type_by_id(btf, i); - const char *name = btf__name_by_offset(btf, t->name_off); + if (strcmp(na, nb) > 0) + return; - if (name && !strcmp(type_name, name)) - return i; + if (named_start_id == 0 && na[0] != '\0') + named_start_id = i - 1; + if (named_start_id == 0 && nb[0] != '\0') + named_start_id = i; } - return libbpf_err(-ENOENT); + if (named_start_id) + btf->named_start_id = named_start_id; +} + +static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name, + __s32 start_id) +{ + const struct btf_type *t; + const char *tname; + __s32 l, r, m; + + l = start_id; + r = btf__type_cnt(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } + } + + return btf__type_cnt(btf); } static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, - const char *type_name, __u32 kind) + const char *type_name, __s32 kind) { - __u32 i, nr_types = btf__type_cnt(btf); + __u32 nr_types = btf__type_cnt(btf); + const struct btf_type *t; + const char *tname; + __s32 id; - if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) - return 0; + if (start_id < btf->start_id) { + id = btf_find_by_name_kind(btf->base_btf, start_id, + type_name, kind); + if (id >= 0) + return id; + start_id = btf->start_id; + } - for (i = start_id; i < nr_types; i++) { - const struct btf_type *t = btf__type_by_id(btf, i); - const char *name; + if (kind == BTF_KIND_UNKN || strcmp(type_name, "void") == 0) + return 0; - if (btf_kind(t) != kind) - continue; - name = btf__name_by_offset(btf, t->name_off); - if (name && !strcmp(type_name, name)) - return i; + if (btf->named_start_id > 0 && type_name[0]) { + start_id = max(start_id, btf->named_start_id); + id = btf_find_type_by_name_bsearch(btf, type_name, start_id); + for (; id < nr_types; id++) { + t = btf__type_by_id(btf, id); + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, type_name) != 0) + return libbpf_err(-ENOENT); + if (kind < 0 || btf_kind(t) == kind) + return id; + } + } else { + for (id = start_id; id < nr_types; id++) { + t = btf_type_by_id(btf, id); + if (kind > 0 && btf_kind(t) != kind) + continue; + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, type_name) == 0) + return id; + } } return libbpf_err(-ENOENT); } +/* the kind value of -1 indicates that kind matching should be skipped */ +__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +{ + return btf_find_by_name_kind(btf, 1, type_name, -1); +} + __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind) { @@ -1006,6 +1067,7 @@ static struct btf *btf_new_empty(struct btf *base_btf) btf->fd = -1; btf->ptr_sz = sizeof(void *); btf->swapped_endian = false; + btf->named_start_id = 0; if (base_btf) { btf->base_btf = base_btf; @@ -1057,6 +1119,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b btf->start_id = 1; btf->start_str_off = 0; btf->fd = -1; + btf->named_start_id = 0; if (base_btf) { btf->base_btf = base_btf; @@ -1091,6 +1154,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b err = err ?: btf_sanity_check(btf); if (err) goto done; + btf_check_sorted(btf); done: if (err) { @@ -1715,6 +1779,7 @@ static void btf_invalidate_raw_data(struct btf *btf) free(btf->raw_data_swapped); btf->raw_data_swapped = NULL; } + btf->named_start_id = 0; } /* Ensure BTF is ready to be modified (by splitting into a three memory @@ -2069,7 +2134,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* byte_sz must be power of 2 */ if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) @@ -2117,7 +2182,7 @@ int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* byte_sz must be one of the explicitly allowed values */ @@ -2172,7 +2237,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2249,7 +2314,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2350,7 +2415,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, if (!m) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2388,7 +2453,7 @@ static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2446,7 +2511,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) return libbpf_err(-EINVAL); /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (value < INT_MIN || value > UINT_MAX) return libbpf_err(-E2BIG); @@ -2523,7 +2588,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) return libbpf_err(-EINVAL); /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* decompose and invalidate raw data */ @@ -2563,7 +2628,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) */ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) { - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); switch (fwd_kind) { @@ -2599,7 +2664,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) */ int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id) { - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id, 0); @@ -2651,7 +2716,7 @@ int btf__add_restrict(struct btf *btf, int ref_type_id) */ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) { - if (!value || !value[0]) + if (str_is_empty(value)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 0); @@ -2668,7 +2733,7 @@ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) */ int btf__add_type_attr(struct btf *btf, const char *value, int ref_type_id) { - if (!value || !value[0]) + if (str_is_empty(value)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 1); @@ -2687,7 +2752,7 @@ int btf__add_func(struct btf *btf, const char *name, { int id; - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL && linkage != BTF_FUNC_EXTERN) @@ -2773,7 +2838,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) if (!p) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2808,7 +2873,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED && linkage != BTF_VAR_GLOBAL_EXTERN) @@ -2857,7 +2922,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (btf_ensure_modifiable(btf)) @@ -2934,7 +2999,7 @@ static int btf_add_decl_tag(struct btf *btf, const char *value, int ref_type_id, struct btf_type *t; int sz, value_off; - if (!value || !value[0] || component_idx < -1) + if (str_is_empty(value) || component_idx < -1) return libbpf_err(-EINVAL); if (validate_type_id(ref_type_id)) @@ -4431,11 +4496,14 @@ static bool btf_dedup_identical_types(struct btf_dedup *d, __u32 id1, __u32 id2, struct btf_type *t1, *t2; int k1, k2; recur: - if (depth <= 0) - return false; - t1 = btf_type_by_id(d->btf, id1); t2 = btf_type_by_id(d->btf, id2); + if (depth <= 0) { + pr_debug("Reached depth limit for identical type comparison for '%s'/'%s'\n", + btf__name_by_offset(d->btf, t1->name_off), + btf__name_by_offset(d->btf, t2->name_off)); + return false; + } k1 = btf_kind(t1); k2 = btf_kind(t2); @@ -4497,8 +4565,16 @@ recur: for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { if (m1->type == m2->type) continue; - if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) + if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) { + if (t1->name_off) { + pr_debug("%s '%s' size=%d vlen=%d id1[%u] id2[%u] shallow-equal but not identical for field#%d '%s'\n", + k1 == BTF_KIND_STRUCT ? "STRUCT" : "UNION", + btf__name_by_offset(d->btf, t1->name_off), + t1->size, btf_vlen(t1), id1, id2, i, + btf__name_by_offset(d->btf, m1->name_off)); + } return false; + } } return true; } @@ -4739,8 +4815,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, canon_m = btf_members(canon_type); for (i = 0; i < vlen; i++) { eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); - if (eq <= 0) + if (eq <= 0) { + if (cand_type->name_off) { + pr_debug("%s '%s' size=%d vlen=%d cand_id[%u] canon_id[%u] shallow-equal but not equiv for field#%d '%s': %d\n", + cand_kind == BTF_KIND_STRUCT ? "STRUCT" : "UNION", + btf__name_by_offset(d->btf, cand_type->name_off), + cand_type->size, vlen, cand_id, canon_id, i, + btf__name_by_offset(d->btf, cand_m->name_off), eq); + } return eq; + } cand_m++; canon_m++; } @@ -5868,3 +5952,136 @@ int btf__relocate(struct btf *btf, const struct btf *base_btf) btf->owns_base = false; return libbpf_err(err); } + +struct btf_permute { + struct btf *btf; + __u32 *id_map; + __u32 start_offs; +}; + +/* Callback function to remap individual type ID references */ +static int btf_permute_remap_type_id(__u32 *type_id, void *ctx) +{ + struct btf_permute *p = ctx; + __u32 new_id = *type_id; + + /* refer to the base BTF or VOID type */ + if (new_id < p->btf->start_id) + return 0; + + if (new_id >= btf__type_cnt(p->btf)) + return -EINVAL; + + *type_id = p->id_map[new_id - p->btf->start_id + p->start_offs]; + return 0; +} + +int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt, + const struct btf_permute_opts *opts) +{ + struct btf_permute p; + struct btf_ext *btf_ext; + void *nt, *new_types = NULL; + __u32 *order_map = NULL; + int err = 0, i; + __u32 n, id, start_offs = 0; + + if (!OPTS_VALID(opts, btf_permute_opts)) + return libbpf_err(-EINVAL); + + if (btf__base_btf(btf)) { + n = btf->nr_types; + } else { + if (id_map[0] != 0) + return libbpf_err(-EINVAL); + n = btf__type_cnt(btf); + start_offs = 1; + } + + if (id_map_cnt != n) + return libbpf_err(-EINVAL); + + /* record the sequence of types */ + order_map = calloc(id_map_cnt, sizeof(*id_map)); + if (!order_map) { + err = -ENOMEM; + goto done; + } + + new_types = calloc(btf->hdr->type_len, 1); + if (!new_types) { + err = -ENOMEM; + goto done; + } + + if (btf_ensure_modifiable(btf)) { + err = -ENOMEM; + goto done; + } + + for (i = start_offs; i < id_map_cnt; i++) { + id = id_map[i]; + if (id < btf->start_id || id >= btf__type_cnt(btf)) { + err = -EINVAL; + goto done; + } + id -= btf->start_id - start_offs; + /* cannot be mapped to the same ID */ + if (order_map[id]) { + err = -EINVAL; + goto done; + } + order_map[id] = i + btf->start_id - start_offs; + } + + p.btf = btf; + p.id_map = id_map; + p.start_offs = start_offs; + nt = new_types; + for (i = start_offs; i < id_map_cnt; i++) { + struct btf_field_iter it; + const struct btf_type *t; + __u32 *type_id; + int type_size; + + id = order_map[i]; + t = btf__type_by_id(btf, id); + type_size = btf_type_size(t); + memcpy(nt, t, type_size); + + /* fix up referenced IDs for BTF */ + err = btf_field_iter_init(&it, nt, BTF_FIELD_ITER_IDS); + if (err) + goto done; + while ((type_id = btf_field_iter_next(&it))) { + err = btf_permute_remap_type_id(type_id, &p); + if (err) + goto done; + } + + nt += type_size; + } + + /* fix up referenced IDs for btf_ext */ + btf_ext = OPTS_GET(opts, btf_ext, NULL); + if (btf_ext) { + err = btf_ext_visit_type_ids(btf_ext, btf_permute_remap_type_id, &p); + if (err) + goto done; + } + + for (nt = new_types, i = 0; i < id_map_cnt - start_offs; i++) { + btf->type_offs[i] = nt - new_types; + nt += btf_type_size(nt); + } + + free(order_map); + free(btf->types_data); + btf->types_data = new_types; + return 0; + +done: + free(order_map); + free(new_types); + return libbpf_err(err); +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index cc01494d6210..b30008c267c0 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -281,6 +281,48 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); */ LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf); +struct btf_permute_opts { + size_t sz; + /* optional .BTF.ext info along the main BTF info */ + struct btf_ext *btf_ext; + size_t :0; +}; +#define btf_permute_opts__last_field btf_ext + +/** + * @brief **btf__permute()** rearranges BTF types in-place according to a specified ID mapping + * @param btf BTF object to permute + * @param id_map Array mapping original type IDs to new IDs + * @param id_map_cnt Number of elements in @id_map + * @param opts Optional parameters, including BTF extension data for reference updates + * @return 0 on success, negative error code on failure + * + * **btf__permute()** reorders BTF types based on the provided @id_map array, + * updating all internal type references to maintain consistency. The function + * operates in-place, modifying the BTF object directly. + * + * For **base BTF**: + * - @id_map must include all types from ID 0 to `btf__type_cnt(btf) - 1` + * - @id_map_cnt must be `btf__type_cnt(btf)` + * - Mapping is defined as `id_map[original_id] = new_id` + * - `id_map[0]` must be 0 (void type cannot be moved) + * + * For **split BTF**: + * - @id_map must include only split types (types added on top of the base BTF) + * - @id_map_cnt must be `btf__type_cnt(btf) - btf__type_cnt(btf__base_btf(btf))` + * - Mapping is defined as `id_map[original_id - start_id] = new_id` + * - `start_id` equals `btf__type_cnt(btf__base_btf(btf))` + * + * After permutation, all type references within the BTF data and optional + * BTF extension (if provided via @opts) are updated automatically. + * + * On error, returns a negative error code and sets errno: + * - `-EINVAL`: Invalid parameters or invalid ID mapping + * - `-ENOMEM`: Memory allocation failure + */ +LIBBPF_API int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt, + const struct btf_permute_opts *opts); + struct btf_dump; struct btf_dump_opts { diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 6388392f49a0..53c6624161d7 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1762,9 +1762,18 @@ static int btf_dump_get_bitfield_value(struct btf_dump *d, __u16 left_shift_bits, right_shift_bits; const __u8 *bytes = data; __u8 nr_copy_bits; + __u8 start_bit, nr_bytes; __u64 num = 0; int i; + /* Calculate how many bytes cover the bitfield */ + start_bit = bits_offset % 8; + nr_bytes = (start_bit + bit_sz + 7) / 8; + + /* Bound check */ + if (data + nr_bytes > d->typed_dump->data_end) + return -E2BIG; + /* Maximum supported bitfield size is 64 bits */ if (t->size > 8) { pr_warn("unexpected bitfield size %d\n", t->size); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f4dfd23148a5..0c8bf0b5cce4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_FENTRY] = "trace_fentry", [BPF_TRACE_FEXIT] = "trace_fexit", [BPF_MODIFY_RETURN] = "modify_return", + [BPF_TRACE_FSESSION] = "trace_fsession", [BPF_LSM_MAC] = "lsm_mac", [BPF_LSM_CGROUP] = "lsm_cgroup", [BPF_SK_LOOKUP] = "sk_lookup", @@ -380,7 +381,7 @@ struct reloc_desc { const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ struct { int map_idx; - int sym_off; + unsigned int sym_off; /* * The following two fields can be unionized, as the * ext_idx field is used for extern symbols, and the @@ -757,13 +758,14 @@ struct bpf_object { int arena_map_idx; void *arena_data; size_t arena_data_sz; + size_t arena_data_off; void *jumptables_data; size_t jumptables_data_sz; struct { struct bpf_program *prog; - int sym_off; + unsigned int sym_off; int fd; } *jumptable_maps; size_t jumptable_map_cnt; @@ -2903,7 +2905,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, var_extra = btf_var(var); map_name = btf__name_by_offset(obj->btf, var->name_off); - if (map_name == NULL || map_name[0] == '\0') { + if (str_is_empty(map_name)) { pr_warn("map #%d: empty name.\n", var_idx); return -EINVAL; } @@ -2991,10 +2993,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, void *data, size_t data_sz) { const long page_sz = sysconf(_SC_PAGE_SIZE); + const size_t data_alloc_sz = roundup(data_sz, page_sz); size_t mmap_sz; mmap_sz = bpf_map_mmap_sz(map); - if (roundup(data_sz, page_sz) > mmap_sz) { + if (data_alloc_sz > mmap_sz) { pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", sec_name, mmap_sz, data_sz); return -E2BIG; @@ -3006,6 +3009,9 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, memcpy(obj->arena_data, data, data_sz); obj->arena_data_sz = data_sz; + /* place globals at the end of the arena */ + obj->arena_data_off = mmap_sz - data_alloc_sz; + /* make bpf_map__init_value() work for ARENA maps */ map->mmaped = obj->arena_data; @@ -4276,7 +4282,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj) if (!sym_is_extern(sym)) continue; ext_name = elf_sym_str(obj, sym->st_name); - if (!ext_name || !ext_name[0]) + if (str_is_empty(ext_name)) continue; ext = obj->externs; @@ -4663,7 +4669,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_DATA; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = obj->arena_map_idx; - reloc_desc->sym_off = sym->st_value; + reloc_desc->sym_off = sym->st_value + obj->arena_data_off; map = &obj->maps[obj->arena_map_idx]; pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n", @@ -5624,7 +5630,8 @@ retry: return err; } if (obj->arena_data) { - memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + memcpy(map->mmaped + obj->arena_data_off, obj->arena_data, + obj->arena_data_sz); zfree(&obj->arena_data); } } @@ -6192,7 +6199,7 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; } -static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off) +static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off) { size_t i; @@ -6210,7 +6217,7 @@ static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym return -ENOENT; } -static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd) +static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off, int map_fd) { size_t cnt = obj->jumptable_map_cnt; size_t size = sizeof(obj->jumptable_maps[0]); @@ -6244,7 +6251,7 @@ static int find_subprog_idx(struct bpf_program *prog, int insn_idx) static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo) { const __u32 jt_entry_size = 8; - int sym_off = relo->sym_off; + unsigned int sym_off = relo->sym_off; int jt_size = relo->sym_size; __u32 max_entries = jt_size / jt_entry_size; __u32 value_size = sizeof(struct bpf_insn_array_value); @@ -6260,7 +6267,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc return map_fd; if (sym_off % jt_entry_size) { - pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n", + pr_warn("map '.jumptables': jumptable start %u should be multiple of %u\n", sym_off, jt_entry_size); return -EINVAL; } @@ -6316,7 +6323,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc * should contain values that fit in u32. */ if (insn_off > UINT32_MAX) { - pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n", + pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %u\n", (long long)jt[i], sym_off + i * jt_entry_size); err = -EINVAL; goto err_close; @@ -9853,6 +9860,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fsession+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fsession.s+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), @@ -10913,7 +10922,7 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) } static int validate_map_op(const struct bpf_map *map, size_t key_sz, - size_t value_sz, bool check_value_sz) + size_t value_sz, bool check_value_sz, __u64 flags) { if (!map_is_created(map)) /* map is not yet created */ return -ENOENT; @@ -10940,6 +10949,20 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, int num_cpu = libbpf_num_possible_cpus(); size_t elem_sz = roundup(map->def.value_size, 8); + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) { + pr_warn("map '%s': BPF_F_CPU and BPF_F_ALL_CPUS are mutually exclusive\n", + map->name); + return -EINVAL; + } + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided for either BPF_F_CPU or BPF_F_ALL_CPUS, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + if (value_sz != num_cpu * elem_sz) { pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); @@ -10964,7 +10987,7 @@ int bpf_map__lookup_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -10977,7 +11000,7 @@ int bpf_map__update_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -10989,7 +11012,7 @@ int bpf_map__delete_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, flags); if (err) return libbpf_err(err); @@ -11002,7 +11025,7 @@ int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -11014,7 +11037,7 @@ int bpf_map__get_next_key(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, 0); if (err) return libbpf_err(err); @@ -14134,6 +14157,37 @@ int bpf_program__set_attach_target(struct bpf_program *prog, return 0; } +int bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map, + struct bpf_prog_assoc_struct_ops_opts *opts) +{ + int prog_fd, map_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't associate BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + pr_warn("prog '%s': can't associate struct_ops program\n", prog->name); + return libbpf_err(-EINVAL); + } + + map_fd = bpf_map__fd(map); + if (map_fd < 0) { + pr_warn("map '%s': can't associate BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + + if (!bpf_map__is_struct_ops(map)) { + pr_warn("map '%s': can't associate non-struct_ops map\n", map->name); + return libbpf_err(-EINVAL); + } + + return bpf_prog_assoc_struct_ops(prog_fd, map_fd, opts); +} + int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) { int err = 0, n, len, start, end = -1; @@ -14399,7 +14453,10 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) if (!map_skel->mmaped) continue; - *map_skel->mmaped = map->mmaped; + if (map->def.type == BPF_MAP_TYPE_ARENA) + *map_skel->mmaped = map->mmaped + map->obj->arena_data_off; + else + *map_skel->mmaped = map->mmaped; } return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 65e68e964b89..dfc37a615578 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1006,6 +1006,22 @@ LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name); +struct bpf_prog_assoc_struct_ops_opts; /* defined in bpf.h */ + +/** + * @brief **bpf_program__assoc_struct_ops()** associates a BPF program with a + * struct_ops map. + * + * @param prog BPF program + * @param map struct_ops map to be associated with the BPF program + * @param opts optional options, can be NULL + * + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int +bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map, + struct bpf_prog_assoc_struct_ops_opts *opts); + /** * @brief **bpf_object__find_map_by_name()** returns BPF map of * the given name, if it exists within the passed BPF object @@ -1200,12 +1216,13 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** * @param value pointer to memory in which looked up value will be stored * @param value_sz size in byte of value data memory; it has to match BPF map - * definition's **value_size**. For per-CPU BPF maps value size has to be - * a product of BPF map value size and number of possible CPUs in the system - * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for - * per-CPU values value size has to be aligned up to closest 8 bytes for - * alignment reasons, so expected size is: `round_up(value_size, 8) - * * libbpf_num_possible_cpus()`. + * definition's **value_size**. For per-CPU BPF maps, value size can be + * `value_size` if either **BPF_F_CPU** or **BPF_F_ALL_CPUS** is specified + * in **flags**, otherwise a product of BPF map value size and number of + * possible CPUs in the system (could be fetched with + * **libbpf_num_possible_cpus()**). Note also that for per-CPU values value + * size has to be aligned up to closest 8 bytes, so expected size is: + * `round_up(value_size, 8) * libbpf_num_possible_cpus()`. * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * @@ -1223,13 +1240,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, * @param key pointer to memory containing bytes of the key * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** * @param value pointer to memory containing bytes of the value - * @param value_sz size in byte of value data memory; it has to match BPF map - * definition's **value_size**. For per-CPU BPF maps value size has to be - * a product of BPF map value size and number of possible CPUs in the system - * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for - * per-CPU values value size has to be aligned up to closest 8 bytes for - * alignment reasons, so expected size is: `round_up(value_size, 8) - * * libbpf_num_possible_cpus()`. + * @param value_sz refer to **bpf_map__lookup_elem**'s description.' * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 8ed8749907d4..d18fbcea7578 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -451,4 +451,7 @@ LIBBPF_1.7.0 { global: bpf_map__set_exclusive_program; bpf_map__exclusive_program; + bpf_prog_assoc_struct_ops; + bpf_program__assoc_struct_ops; + btf__permute; } LIBBPF_1.6.0; diff --git a/tools/lib/python/abi/abi_parser.py b/tools/lib/python/abi/abi_parser.py index 9b8db70067ef..d7bb20ef3acc 100644 --- a/tools/lib/python/abi/abi_parser.py +++ b/tools/lib/python/abi/abi_parser.py @@ -21,14 +21,17 @@ from abi.helpers import AbiDebug, ABI_DIR class AbiParser: - """Main class to parse ABI files""" + """Main class to parse ABI files.""" + #: Valid tags at Documentation/ABI. TAGS = r"(what|where|date|kernelversion|contact|description|users)" + + #: ABI elements that will auto-generate cross-references. XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)" def __init__(self, directory, logger=None, enable_lineno=False, show_warnings=True, debug=0): - """Stores arguments for the class and initialize class vars""" + """Stores arguments for the class and initialize class vars.""" self.directory = directory self.enable_lineno = enable_lineno @@ -65,7 +68,7 @@ class AbiParser: self.re_xref_node = re.compile(self.XREF) def warn(self, fdata, msg, extra=None): - """Displays a parse error if warning is enabled""" + """Displays a parse error if warning is enabled.""" if not self.show_warnings: return @@ -77,7 +80,7 @@ class AbiParser: self.log.warning(msg) def add_symbol(self, what, fname, ln=None, xref=None): - """Create a reference table describing where each 'what' is located""" + """Create a reference table describing where each 'what' is located.""" if what not in self.what_symbols: self.what_symbols[what] = {"file": {}} @@ -92,7 +95,7 @@ class AbiParser: self.what_symbols[what]["xref"] = xref def _parse_line(self, fdata, line): - """Parse a single line of an ABI file""" + """Parse a single line of an ABI file.""" new_what = False new_tag = False @@ -264,7 +267,7 @@ class AbiParser: self.warn(fdata, "Unexpected content", line) def parse_readme(self, nametag, fname): - """Parse ABI README file""" + """Parse ABI README file.""" nametag["what"] = ["Introduction"] nametag["path"] = "README" @@ -282,7 +285,7 @@ class AbiParser: nametag["description"] += line def parse_file(self, fname, path, basename): - """Parse a single file""" + """Parse a single file.""" ref = f"abi_file_{path}_{basename}" ref = self.re_unprintable.sub("_", ref).strip("_") @@ -348,7 +351,7 @@ class AbiParser: self.add_symbol(what=w, fname=fname, xref=fdata.key) def _parse_abi(self, root=None): - """Internal function to parse documentation ABI recursively""" + """Internal function to parse documentation ABI recursively.""" if not root: root = self.directory @@ -377,7 +380,7 @@ class AbiParser: self.parse_file(name, path, basename) def parse_abi(self, root=None): - """Parse documentation ABI""" + """Parse documentation ABI.""" self._parse_abi(root) @@ -385,7 +388,7 @@ class AbiParser: self.log.debug(pformat(self.data)) def desc_txt(self, desc): - """Print description as found inside ABI files""" + """Print description as found inside ABI files.""" desc = desc.strip(" \t\n") @@ -393,7 +396,7 @@ class AbiParser: def xref(self, fname): """ - Converts a Documentation/ABI + basename into a ReST cross-reference + Converts a Documentation/ABI + basename into a ReST cross-reference. """ xref = self.file_refs.get(fname) @@ -403,7 +406,7 @@ class AbiParser: return xref def desc_rst(self, desc): - """Enrich ReST output by creating cross-references""" + """Enrich ReST output by creating cross-references.""" # Remove title markups from the description # Having titles inside ABI files will only work if extra @@ -459,7 +462,7 @@ class AbiParser: def doc(self, output_in_txt=False, show_symbols=True, show_file=True, filter_path=None): - """Print ABI at stdout""" + """Print ABI at stdout.""" part = None for key, v in sorted(self.data.items(), @@ -549,7 +552,7 @@ class AbiParser: yield (msg, file_ref[0][0], ln) def check_issues(self): - """Warn about duplicated ABI entries""" + """Warn about duplicated ABI entries.""" for what, v in self.what_symbols.items(): files = v.get("file") @@ -575,7 +578,7 @@ class AbiParser: self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) def search_symbols(self, expr): - """ Searches for ABI symbols """ + """ Searches for ABI symbols.""" regex = re.compile(expr, re.I) diff --git a/tools/lib/python/abi/abi_regex.py b/tools/lib/python/abi/abi_regex.py index d5553206de3c..d0c5e3ede6b5 100644 --- a/tools/lib/python/abi/abi_regex.py +++ b/tools/lib/python/abi/abi_regex.py @@ -16,10 +16,22 @@ from abi.abi_parser import AbiParser from abi.helpers import AbiDebug class AbiRegex(AbiParser): - """Extends AbiParser to search ABI nodes with regular expressions""" + """ + Extends AbiParser to search ABI nodes with regular expressions. - # Escape only ASCII visible characters + There some optimizations here to allow a quick symbol search: + instead of trying to place all symbols altogether an doing linear + search which is very time consuming, create a tree with one depth, + grouping similar symbols altogether. + + Yet, sometimes a full search will be needed, so we have a special branch + on such group tree where other symbols are placed. + """ + + #: Escape only ASCII visible characters. escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])" + + #: Special group for other nodes. leave_others = "others" # Tuples with regular expressions to be compiled and replacement data @@ -88,13 +100,15 @@ class AbiRegex(AbiParser): # Recover plus characters (re.compile(r"\xf7"), "+"), ] + + #: Regex to check if the symbol name has a number on it. re_has_num = re.compile(r"\\d") - # Symbol name after escape_chars that are considered a devnode basename + #: Symbol name after escape_chars that are considered a devnode basename. re_symbol_name = re.compile(r"(\w|\\[\.\-\:])+$") - # List of popular group names to be skipped to minimize regex group size - # Use AbiDebug.SUBGROUP_SIZE to detect those + #: List of popular group names to be skipped to minimize regex group size + #: Use AbiDebug.SUBGROUP_SIZE to detect those. skip_names = set(["devices", "hwmon"]) def regex_append(self, what, new): @@ -148,7 +162,7 @@ class AbiRegex(AbiParser): def get_regexes(self, what): """ Given an ABI devnode, return a list of all regular expressions that - may match it, based on the sub-groups created by regex_append() + may match it, based on the sub-groups created by regex_append(). """ re_list = [] diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py index 639b23e4ca33..2a378d780d3c 100644 --- a/tools/lib/python/abi/helpers.py +++ b/tools/lib/python/abi/helpers.py @@ -13,26 +13,28 @@ ABI_DIR = "Documentation/ABI/" class AbiDebug: """Debug levels""" - WHAT_PARSING = 1 - WHAT_OPEN = 2 - DUMP_ABI_STRUCTS = 4 - UNDEFINED = 8 - REGEX = 16 - SUBGROUP_MAP = 32 - SUBGROUP_DICT = 64 - SUBGROUP_SIZE = 128 - GRAPH = 256 - + WHAT_PARSING = 1 #: Enable debug parsing logic. + WHAT_OPEN = 2 #: Enable debug messages on file open. + DUMP_ABI_STRUCTS = 4 #: Enable debug for ABI parse data. + UNDEFINED = 8 #: Enable extra undefined symbol data. + REGEX = 16 #: Enable debug for what to regex conversion. + SUBGROUP_MAP = 32 #: Enable debug for symbol regex subgroups + SUBGROUP_DICT = 64 #: Enable debug for sysfs graph tree variable. + SUBGROUP_SIZE = 128 #: Enable debug of search groups. + GRAPH = 256 #: Display ref tree graph for undefined symbols. +#: Helper messages for each debug variable DEBUG_HELP = """ -1 - enable debug parsing logic -2 - enable debug messages on file open -4 - enable debug for ABI parse data -8 - enable extra debug information to identify troubles - with ABI symbols found at the local machine that - weren't found on ABI documentation (used only for - undefined subcommand) -16 - enable debug for what to regex conversion -32 - enable debug for symbol regex subgroups -64 - enable debug for sysfs graph tree variable +1 - enable debug parsing logic +2 - enable debug messages on file open +4 - enable debug for ABI parse data +8 - enable extra debug information to identify troubles + with ABI symbols found at the local machine that + weren't found on ABI documentation (used only for + undefined subcommand) +16 - enable debug for what to regex conversion +32 - enable debug for symbol regex subgroups +64 - enable debug for sysfs graph tree variable +128 - enable debug of search groups +256 - enable displaying refrence tree graphs for undefined symbols. """ diff --git a/tools/lib/python/abi/system_symbols.py b/tools/lib/python/abi/system_symbols.py index 4a2554da217b..7bbefd274ea2 100644 --- a/tools/lib/python/abi/system_symbols.py +++ b/tools/lib/python/abi/system_symbols.py @@ -18,11 +18,11 @@ from random import shuffle from abi.helpers import AbiDebug class SystemSymbols: - """Stores arguments for the class and initialize class vars""" + """Stores arguments for the class and initialize class vars.""" def graph_add_file(self, path, link=None): """ - add a file path to the sysfs graph stored at self.root + add a file path to the sysfs graph stored at self.root. """ if path in self.files: @@ -43,7 +43,7 @@ class SystemSymbols: self.files.add(path) def print_graph(self, root_prefix="", root=None, level=0): - """Prints a reference tree graph using UTF-8 characters""" + """Prints a reference tree graph using UTF-8 characters.""" if not root: root = self.root @@ -173,7 +173,7 @@ class SystemSymbols: self._walk(sysfs) def check_file(self, refs, found): - """Check missing ABI symbols for a given sysfs file""" + """Check missing ABI symbols for a given sysfs file.""" res_list = [] @@ -214,7 +214,7 @@ class SystemSymbols: return res_list def _ref_interactor(self, root): - """Recursive function to interact over the sysfs tree""" + """Recursive function to interact over the sysfs tree.""" for k, v in root.items(): if isinstance(v, dict): @@ -232,7 +232,7 @@ class SystemSymbols: def get_fileref(self, all_refs, chunk_size): - """Interactor to group refs into chunks""" + """Interactor to group refs into chunks.""" n = 0 refs = [] @@ -250,7 +250,7 @@ class SystemSymbols: def check_undefined_symbols(self, max_workers=None, chunk_size=50, found=None, dry_run=None): - """Seach ABI for sysfs symbols missing documentation""" + """Seach ABI for sysfs symbols missing documentation.""" self.abi.parse_abi() diff --git a/tools/lib/python/feat/parse_features.py b/tools/lib/python/feat/parse_features.py index b88c04d3e2fe..41a51d9d6f62 100755 --- a/tools/lib/python/feat/parse_features.py +++ b/tools/lib/python/feat/parse_features.py @@ -21,14 +21,25 @@ class ParseFeature: from it. """ + #: feature header string. h_name = "Feature" + + #: Kernel config header string. h_kconfig = "Kconfig" + + #: description header string. h_description = "Description" + + #: subsystem header string. h_subsys = "Subsystem" + + #: status header string. h_status = "Status" + + #: architecture header string. h_arch = "Architecture" - # Sort order for status. Others will be mapped at the end. + #: Sort order for status. Others will be mapped at the end. status_map = { "ok": 0, "TODO": 1, @@ -40,7 +51,7 @@ class ParseFeature: def __init__(self, prefix, debug=0, enable_fname=False): """ - Sets internal variables + Sets internal variables. """ self.prefix = prefix @@ -63,11 +74,13 @@ class ParseFeature: self.msg = "" def emit(self, msg="", end="\n"): + """Helper function to append a new message for feature output.""" + self.msg += msg + end def parse_error(self, fname, ln, msg, data=None): """ - Displays an error message, printing file name and line + Displays an error message, printing file name and line. """ if ln: @@ -82,7 +95,7 @@ class ParseFeature: print("", file=sys.stderr) def parse_feat_file(self, fname): - """Parses a single arch-support.txt feature file""" + """Parses a single arch-support.txt feature file.""" if os.path.isdir(fname): return @@ -204,7 +217,7 @@ class ParseFeature: self.max_size_arch_with_header = self.max_size_arch + len(self.h_arch) def parse(self): - """Parses all arch-support.txt feature files inside self.prefix""" + """Parses all arch-support.txt feature files inside self.prefix.""" path = os.path.expanduser(self.prefix) @@ -281,7 +294,7 @@ class ParseFeature: def output_feature(self, feat): """ - Output a feature on all architectures + Output a feature on all architectures. """ title = f"Feature {feat}" @@ -331,7 +344,7 @@ class ParseFeature: def matrix_lines(self, desc_size, max_size_status, header): """ - Helper function to split element tables at the output matrix + Helper function to split element tables at the output matrix. """ if header: diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index a24f30ef4fa8..aba22c33393d 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -11,20 +11,23 @@ Interacts with the POSIX jobserver during the Kernel build time. A "normal" jobserver task, like the one initiated by a make subrocess would do: - open read/write file descriptors to communicate with the job server; - - ask for one slot by calling: + - ask for one slot by calling:: + claim = os.read(reader, 1) - - when the job finshes, call: + + - when the job finshes, call:: + os.write(writer, b"+") # os.write(writer, claim) Here, the goal is different: This script aims to get the remaining number of slots available, using all of them to run a command which handle tasks in parallel. To to that, it has a loop that ends only after there are no slots left. It then increments the number by one, in order to allow a -call equivalent to make -j$((claim+1)), e.g. having a parent make creating +call equivalent to ``make -j$((claim+1))``, e.g. having a parent make creating $claim child to do the actual work. The end goal here is to keep the total number of build tasks under the -limit established by the initial make -j$n_proc call. +limit established by the initial ``make -j$n_proc`` call. See: https://www.gnu.org/software/make/manual/html_node/POSIX-Jobserver.html#POSIX-Jobserver @@ -35,18 +38,22 @@ import os import subprocess import sys +def warn(text, *args): + print(f'WARNING: {text}', *args, file = sys.stderr) + class JobserverExec: """ Claim all slots from make using POSIX Jobserver. The main methods here are: + - open(): reserves all slots; - close(): method returns all used slots back to make; - - run(): executes a command setting PARALLELISM=<available slots jobs + 1> + - run(): executes a command setting PARALLELISM=<available slots jobs + 1>. """ def __init__(self): - """Initialize internal vars""" + """Initialize internal vars.""" self.claim = 0 self.jobs = b"" self.reader = None @@ -54,66 +61,105 @@ class JobserverExec: self.is_open = False def open(self): - """Reserve all available slots to be claimed later on""" + """Reserve all available slots to be claimed later on.""" if self.is_open: return - - try: - # Fetch the make environment options. - flags = os.environ["MAKEFLAGS"] - # Look for "--jobserver=R,W" - # Note that GNU Make has used --jobserver-fds and --jobserver-auth - # so this handles all of them. - opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] - - # Parse out R,W file descriptor numbers and set them nonblocking. - # If the MAKEFLAGS variable contains multiple instances of the - # --jobserver-auth= option, the last one is relevant. - fds = opts[-1].split("=", 1)[1] - - # Starting with GNU Make 4.4, named pipes are used for reader - # and writer. - # Example argument: --jobserver-auth=fifo:/tmp/GMfifo8134 - _, _, path = fds.partition("fifo:") - - if path: + self.is_open = True # We only try once + self.claim = None + # + # Check the make flags for "--jobserver=R,W" + # Note that GNU Make has used --jobserver-fds and --jobserver-auth + # so this handles all of them. + # + flags = os.environ.get('MAKEFLAGS', '') + opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] + if not opts: + return + # + # Separate out the provided file descriptors + # + split_opt = opts[-1].split('=', 1) + if len(split_opt) != 2: + warn('unparseable option:', opts[-1]) + return + fds = split_opt[1] + # + # As of GNU Make 4.4, we'll be looking for a named pipe + # identified as fifo:path + # + if fds.startswith('fifo:'): + path = fds[len('fifo:'):] + try: self.reader = os.open(path, os.O_RDONLY | os.O_NONBLOCK) self.writer = os.open(path, os.O_WRONLY) - else: - self.reader, self.writer = [int(x) for x in fds.split(",", 1)] + except (OSError, IOError): + warn('unable to open jobserver pipe', path) + return + # + # Otherwise look for integer file-descriptor numbers. + # + else: + split_fds = fds.split(',') + if len(split_fds) != 2: + warn('malformed jobserver file descriptors:', fds) + return + try: + self.reader = int(split_fds[0]) + self.writer = int(split_fds[1]) + except ValueError: + warn('non-integer jobserver file-descriptors:', fds) + return + try: + # # Open a private copy of reader to avoid setting nonblocking # on an unexpecting process with the same reader fd. - self.reader = os.open("/proc/self/fd/%d" % (self.reader), + # + self.reader = os.open(f"/proc/self/fd/{self.reader}", os.O_RDONLY | os.O_NONBLOCK) - - # Read out as many jobserver slots as possible - while True: - try: - slot = os.read(self.reader, 8) - self.jobs += slot - except (OSError, IOError) as e: - if e.errno == errno.EWOULDBLOCK: - # Stop at the end of the jobserver queue. - break - # If something went wrong, give back the jobs. - if self.jobs: - os.write(self.writer, self.jobs) - raise e - - # Add a bump for our caller's reserveration, since we're just going - # to sit here blocked on our child. - self.claim = len(self.jobs) + 1 - - except (KeyError, IndexError, ValueError, OSError, IOError): - # Any missing environment strings or bad fds should result in just - # not being parallel. - self.claim = None - - self.is_open = True + except (IOError, OSError) as e: + warn('Unable to reopen jobserver read-side pipe:', repr(e)) + return + # + # OK, we have the channel to the job server; read out as many jobserver + # slots as possible. + # + while True: + try: + slot = os.read(self.reader, 8) + if not slot: + # + # Something went wrong. Clear self.jobs to avoid writing + # weirdness back to the jobserver and give up. + self.jobs = b"" + warn("unexpected empty token from jobserver;" + " possible invalid '--jobserver-auth=' setting") + self.claim = None + return + except (OSError, IOError) as e: + # + # If there is nothing more to read then we are done. + # + if e.errno == errno.EWOULDBLOCK: + break + # + # Anything else says that something went weird; give back + # the jobs and give up. + # + if self.jobs: + os.write(self.writer, self.jobs) + self.claim = None + warn('error reading from jobserver pipe', repr(e)) + return + self.jobs += slot + # + # Add a bump for our caller's reserveration, since we're just going + # to sit here blocked on our child. + # + self.claim = len(self.jobs) + 1 def close(self): - """Return all reserved slots to Jobserver""" + """Return all reserved slots to Jobserver.""" if not self.is_open: return diff --git a/tools/lib/python/kdoc/enrich_formatter.py b/tools/lib/python/kdoc/enrich_formatter.py index bb171567a4ca..d1be4e5e1962 100644 --- a/tools/lib/python/kdoc/enrich_formatter.py +++ b/tools/lib/python/kdoc/enrich_formatter.py @@ -26,12 +26,16 @@ class EnrichFormatter(argparse.HelpFormatter): and how they're used at the __doc__ description. """ def __init__(self, *args, **kwargs): - """Initialize class and check if is TTY""" + """ + Initialize class and check if is TTY. + """ super().__init__(*args, **kwargs) self._tty = sys.stdout.isatty() def enrich_text(self, text): - """Handle ReST markups (currently, only ``foo``)""" + r""" + Handle ReST markups (currently, only \`\`text\`\` markups). + """ if self._tty and text: # Replace ``text`` with ANSI SGR (bold) return re.sub(r'\`\`(.+?)\`\`', @@ -39,12 +43,16 @@ class EnrichFormatter(argparse.HelpFormatter): return text def _fill_text(self, text, width, indent): - """Enrich descriptions with markups on it""" + """ + Enrich descriptions with markups on it. + """ enriched = self.enrich_text(text) return "\n".join(indent + line for line in enriched.splitlines()) def _format_usage(self, usage, actions, groups, prefix): - """Enrich positional arguments at usage: line""" + """ + Enrich positional arguments at usage: line. + """ prog = self._prog parts = [] @@ -63,7 +71,9 @@ class EnrichFormatter(argparse.HelpFormatter): return usage_text def _format_action_invocation(self, action): - """Enrich argument names""" + """ + Enrich argument names. + """ if not action.option_strings: return self.enrich_text(f"``{action.dest.upper()}``") diff --git a/tools/lib/python/kdoc/kdoc_files.py b/tools/lib/python/kdoc/kdoc_files.py index bfe02baf1606..022487ea2cc6 100644 --- a/tools/lib/python/kdoc/kdoc_files.py +++ b/tools/lib/python/kdoc/kdoc_files.py @@ -5,7 +5,8 @@ # pylint: disable=R0903,R0913,R0914,R0917 """ -Parse lernel-doc tags on multiple kernel source files. +Classes for navigating through the files that kernel-doc needs to handle +to generate documentation. """ import argparse @@ -43,7 +44,7 @@ class GlobSourceFiles: self.srctree = srctree def _parse_dir(self, dirname): - """Internal function to parse files recursively""" + """Internal function to parse files recursively.""" with os.scandir(dirname) as obj: for entry in obj: @@ -65,7 +66,7 @@ class GlobSourceFiles: def parse_files(self, file_list, file_not_found_cb): """ Define an iterator to parse all source files from file_list, - handling directories if any + handling directories if any. """ if not file_list: @@ -91,18 +92,18 @@ class KernelFiles(): There are two type of parsers defined here: - self.parse_file(): parses both kernel-doc markups and - EXPORT_SYMBOL* macros; - - self.process_export_file(): parses only EXPORT_SYMBOL* macros. + ``EXPORT_SYMBOL*`` macros; + - self.process_export_file(): parses only ``EXPORT_SYMBOL*`` macros. """ def warning(self, msg): - """Ancillary routine to output a warning and increment error count""" + """Ancillary routine to output a warning and increment error count.""" self.config.log.warning(msg) self.errors += 1 def error(self, msg): - """Ancillary routine to output an error and increment error count""" + """Ancillary routine to output an error and increment error count.""" self.config.log.error(msg) self.errors += 1 @@ -128,7 +129,7 @@ class KernelFiles(): def process_export_file(self, fname): """ - Parses EXPORT_SYMBOL* macros from a single Kernel source file. + Parses ``EXPORT_SYMBOL*`` macros from a single Kernel source file. """ # Prevent parsing the same file twice if results are cached @@ -157,7 +158,7 @@ class KernelFiles(): wcontents_before_sections=False, logger=None): """ - Initialize startup variables and parse all files + Initialize startup variables and parse all files. """ if not verbose: @@ -213,7 +214,7 @@ class KernelFiles(): def parse(self, file_list, export_file=None): """ - Parse all files + Parse all files. """ glob = GlobSourceFiles(srctree=self.config.src_tree) @@ -242,7 +243,7 @@ class KernelFiles(): filenames=None, export_file=None): """ Interacts over the kernel-doc results and output messages, - returning kernel-doc markups on each interaction + returning kernel-doc markups on each interaction. """ self.out_style.set_config(self.config) diff --git a/tools/lib/python/kdoc/kdoc_item.py b/tools/lib/python/kdoc/kdoc_item.py index 19805301cb2c..2b8a93f79716 100644 --- a/tools/lib/python/kdoc/kdoc_item.py +++ b/tools/lib/python/kdoc/kdoc_item.py @@ -4,7 +4,16 @@ # then pass into the output modules. # +""" +Data class to store a kernel-doc Item. +""" + class KdocItem: + """ + A class that will, eventually, encapsulate all of the parsed data that we + then pass into the output modules. + """ + def __init__(self, name, fname, type, start_line, **other_stuff): self.name = name self.fname = fname @@ -24,6 +33,9 @@ class KdocItem: self.other_stuff = other_stuff def get(self, key, default = None): + """ + Get a value from optional keys. + """ return self.other_stuff.get(key, default) def __getitem__(self, key): @@ -33,10 +45,16 @@ class KdocItem: # Tracking of section and parameter information. # def set_sections(self, sections, start_lines): + """ + Set sections and start lines. + """ self.sections = sections self.section_start_lines = start_lines def set_params(self, names, descs, types, starts): + """ + Set parameter list: names, descriptions, types and start lines. + """ self.parameterlist = names self.parameterdescs = descs self.parametertypes = types diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py index b1aaa7fc3604..4210b91dde5f 100644 --- a/tools/lib/python/kdoc/kdoc_output.py +++ b/tools/lib/python/kdoc/kdoc_output.py @@ -5,14 +5,16 @@ # pylint: disable=C0301,R0902,R0911,R0912,R0913,R0914,R0915,R0917 """ -Implement output filters to print kernel-doc documentation. +Classes to implement output filters to print kernel-doc documentation. -The implementation uses a virtual base class (OutputFormat) which +The implementation uses a virtual base class ``OutputFormat``. It contains dispatches to virtual methods, and some code to filter out output messages. The actual implementation is done on one separate class per each type -of output. Currently, there are output classes for ReST and man/troff. +of output, e.g. ``RestFormat`` and ``ManFormat`` classes. + +Currently, there are output classes for ReST and man/troff. """ import os @@ -54,16 +56,19 @@ class OutputFormat: """ # output mode. - OUTPUT_ALL = 0 # output all symbols and doc sections - OUTPUT_INCLUDE = 1 # output only specified symbols - OUTPUT_EXPORTED = 2 # output exported symbols - OUTPUT_INTERNAL = 3 # output non-exported symbols + OUTPUT_ALL = 0 #: Output all symbols and doc sections. + OUTPUT_INCLUDE = 1 #: Output only specified symbols. + OUTPUT_EXPORTED = 2 #: Output exported symbols. + OUTPUT_INTERNAL = 3 #: Output non-exported symbols. - # Virtual member to be overridden at the inherited classes + #: Highlights to be used in ReST format. highlights = [] + #: Blank line character. + blankline = "" + def __init__(self): - """Declare internal vars and set mode to OUTPUT_ALL""" + """Declare internal vars and set mode to ``OUTPUT_ALL``.""" self.out_mode = self.OUTPUT_ALL self.enable_lineno = None @@ -128,7 +133,7 @@ class OutputFormat: self.config.warning(log_msg) def check_doc(self, name, args): - """Check if DOC should be output""" + """Check if DOC should be output.""" if self.no_doc_sections: return False @@ -177,7 +182,7 @@ class OutputFormat: def msg(self, fname, name, args): """ - Handles a single entry from kernel-doc parser + Handles a single entry from kernel-doc parser. """ self.data = "" @@ -199,6 +204,10 @@ class OutputFormat: self.out_enum(fname, name, args) return self.data + if dtype == "var": + self.out_var(fname, name, args) + return self.data + if dtype == "typedef": self.out_typedef(fname, name, args) return self.data @@ -216,27 +225,31 @@ class OutputFormat: # Virtual methods to be overridden by inherited classes # At the base class, those do nothing. def set_symbols(self, symbols): - """Get a list of all symbols from kernel_doc""" + """Get a list of all symbols from kernel_doc.""" def out_doc(self, fname, name, args): - """Outputs a DOC block""" + """Outputs a DOC block.""" def out_function(self, fname, name, args): - """Outputs a function""" + """Outputs a function.""" def out_enum(self, fname, name, args): - """Outputs an enum""" + """Outputs an enum.""" + + def out_var(self, fname, name, args): + """Outputs a variable.""" def out_typedef(self, fname, name, args): - """Outputs a typedef""" + """Outputs a typedef.""" def out_struct(self, fname, name, args): - """Outputs a struct""" + """Outputs a struct.""" class RestFormat(OutputFormat): - """Consts and functions used by ReST output""" + """Consts and functions used by ReST output.""" + #: Highlights to be used in ReST format highlights = [ (type_constant, r"``\1``"), (type_constant2, r"``\1``"), @@ -256,9 +269,13 @@ class RestFormat(OutputFormat): (type_fallback, r":c:type:`\1`"), (type_param_ref, r"**\1\2**") ] + blankline = "\n" + #: Sphinx literal block regex. sphinx_literal = KernRe(r'^[^.].*::$', cache=False) + + #: Sphinx code block regex. sphinx_cblock = KernRe(r'^\.\.\ +code-block::', cache=False) def __init__(self): @@ -273,7 +290,7 @@ class RestFormat(OutputFormat): self.lineprefix = "" def print_lineno(self, ln): - """Outputs a line number""" + """Outputs a line number.""" if self.enable_lineno and ln is not None: ln += 1 @@ -282,7 +299,7 @@ class RestFormat(OutputFormat): def output_highlight(self, args): """ Outputs a C symbol that may require being converted to ReST using - the self.highlights variable + the self.highlights variable. """ input_text = args @@ -472,6 +489,25 @@ class RestFormat(OutputFormat): self.lineprefix = oldprefix self.out_section(args) + def out_var(self, fname, name, args): + oldprefix = self.lineprefix + ln = args.declaration_start_line + full_proto = args.other_stuff["full_proto"] + + self.lineprefix = " " + + self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}``{full_proto}``\n\n" + + self.print_lineno(ln) + self.output_highlight(args.get('purpose', '')) + self.data += "\n" + + if args.other_stuff["default_val"]: + self.data += f'{self.lineprefix}**Initialization**\n\n' + self.output_highlight(f'default: ``{args.other_stuff["default_val"]}``') + + self.out_section(args) + def out_typedef(self, fname, name, args): oldprefix = self.lineprefix @@ -544,7 +580,7 @@ class RestFormat(OutputFormat): class ManFormat(OutputFormat): - """Consts and functions used by man pages output""" + """Consts and functions used by man pages output.""" highlights = ( (type_constant, r"\1"), @@ -561,6 +597,7 @@ class ManFormat(OutputFormat): ) blankline = "" + #: Allowed timestamp formats. date_formats = [ "%a %b %d %H:%M:%S %Z %Y", "%a %b %d %H:%M:%S %Y", @@ -627,7 +664,7 @@ class ManFormat(OutputFormat): self.symbols = symbols def out_tail(self, fname, name, args): - """Adds a tail for all man pages""" + """Adds a tail for all man pages.""" # SEE ALSO section self.data += f'.SH "SEE ALSO"' + "\n.PP\n" @@ -663,7 +700,7 @@ class ManFormat(OutputFormat): def output_highlight(self, block): """ Outputs a C symbol that may require being highlighted with - self.highlights variable using troff syntax + self.highlights variable using troff syntax. """ contents = self.highlight_block(block) @@ -694,7 +731,6 @@ class ManFormat(OutputFormat): self.output_highlight(text) def out_function(self, fname, name, args): - """output function in man""" out_name = self.arg_name(args, name) @@ -773,6 +809,26 @@ class ManFormat(OutputFormat): self.data += f'.SH "{section}"' + "\n" self.output_highlight(text) + def out_var(self, fname, name, args): + out_name = self.arg_name(args, name) + full_proto = args.other_stuff["full_proto"] + + self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n" + + self.data += ".SH NAME\n" + self.data += f"{name} \\- {args['purpose']}\n" + + self.data += ".SH SYNOPSIS\n" + self.data += f"{full_proto}\n" + + if args.other_stuff["default_val"]: + self.data += f'.SH "Initialization"' + "\n" + self.output_highlight(f'default: {args.other_stuff["default_val"]}') + + for section, text in args.sections.items(): + self.data += f'.SH "{section}"' + "\n" + self.output_highlight(text) + def out_typedef(self, fname, name, args): module = self.modulename purpose = args.get('purpose') diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py index 500aafc50032..fd57944ae907 100644 --- a/tools/lib/python/kdoc/kdoc_parser.py +++ b/tools/lib/python/kdoc/kdoc_parser.py @@ -5,11 +5,8 @@ # pylint: disable=C0301,C0302,R0904,R0912,R0913,R0914,R0915,R0917,R1702 """ -kdoc_parser -=========== - -Read a C language source or header FILE and extract embedded -documentation comments +Classes and functions related to reading a C language source or header FILE +and extract embedded documentation comments from it. """ import sys @@ -53,7 +50,7 @@ doc_content = doc_com_body + KernRe(r'(.*)', cache=False) doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False) doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False) doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False) -doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False) +doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@\s*[\w][\w\.]*\s*):\s*(.*)\s*\*/\s*$', cache=False) export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False) export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False) @@ -64,7 +61,7 @@ type_param = KernRe(r"@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False) # Tests for the beginning of a kerneldoc block in its various forms. # doc_block = doc_com + KernRe(r'DOC:\s*(.*)?', cache=False) -doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef)\b\s*(\w*)", cache = False) +doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef|var)\b\s*(\w*)", cache = False) doc_begin_func = KernRe(str(doc_com) + # initial " * ' r"(?:\w+\s*\*\s*)?" + # type (not captured) r'(?:define\s+)?' + # possible "define" (not captured) @@ -195,25 +192,28 @@ function_xforms = [ ] # -# Apply a set of transforms to a block of text. +# Ancillary functions # + def apply_transforms(xforms, text): + """ + Apply a set of transforms to a block of text. + """ for search, subst in xforms: text = search.sub(subst, text) return text -# -# A little helper to get rid of excess white space -# multi_space = KernRe(r'\s\s+') def trim_whitespace(s): + """ + A little helper to get rid of excess white space. + """ return multi_space.sub(' ', s.strip()) -# -# Remove struct/enum members that have been marked "private". -# def trim_private_members(text): - # + """ + Remove ``struct``/``enum`` members that have been marked "private". + """ # First look for a "public:" block that ends a private region, then # handle the "private until the end" case. # @@ -226,20 +226,21 @@ def trim_private_members(text): class state: """ - State machine enums + States used by the parser's state machine. """ # Parser states - NORMAL = 0 # normal code - NAME = 1 # looking for function name - DECLARATION = 2 # We have seen a declaration which might not be done - BODY = 3 # the body of the comment - SPECIAL_SECTION = 4 # doc section ending with a blank line - PROTO = 5 # scanning prototype - DOCBLOCK = 6 # documentation block - INLINE_NAME = 7 # gathering doc outside main block - INLINE_TEXT = 8 # reading the body of inline docs - + NORMAL = 0 #: Normal code. + NAME = 1 #: Looking for function name. + DECLARATION = 2 #: We have seen a declaration which might not be done. + BODY = 3 #: The body of the comment. + SPECIAL_SECTION = 4 #: Doc section ending with a blank line. + PROTO = 5 #: Scanning prototype. + DOCBLOCK = 6 #: Documentation block. + INLINE_NAME = 7 #: Gathering doc outside main block. + INLINE_TEXT = 8 #: Reading the body of inline docs. + + #: Names for each parser state. name = [ "NORMAL", "NAME", @@ -253,9 +254,12 @@ class state: ] -SECTION_DEFAULT = "Description" # default section +SECTION_DEFAULT = "Description" #: Default section. class KernelEntry: + """ + Encapsulates a Kernel documentation entry. + """ def __init__(self, config, fname, ln): self.config = config @@ -288,14 +292,16 @@ class KernelEntry: # Management of section contents # def add_text(self, text): + """Add a new text to the entry contents list.""" self._contents.append(text) def contents(self): + """Returns a string with all content texts that were added.""" return '\n'.join(self._contents) + '\n' # TODO: rename to emit_message after removal of kernel-doc.pl def emit_msg(self, ln, msg, *, warning=True): - """Emit a message""" + """Emit a message.""" log_msg = f"{self.fname}:{ln} {msg}" @@ -309,10 +315,10 @@ class KernelEntry: self.warnings.append(log_msg) return - # - # Begin a new section. - # def begin_section(self, line_no, title = SECTION_DEFAULT, dump = False): + """ + Begin a new section. + """ if dump: self.dump_section(start_new = True) self.section = title @@ -366,11 +372,13 @@ class KernelDoc: documentation comments. """ - # Section names - + #: Name of context section. section_context = "Context" + + #: Name of return section. section_return = "Return" + #: String to write when a parameter is not described. undescribed = "-- undescribed --" def __init__(self, config, fname): @@ -416,7 +424,7 @@ class KernelDoc: def dump_section(self, start_new=True): """ - Dumps section contents to arrays/hashes intended for that purpose. + Dump section contents to arrays/hashes intended for that purpose. """ if self.entry: @@ -425,9 +433,9 @@ class KernelDoc: # TODO: rename it to store_declaration after removal of kernel-doc.pl def output_declaration(self, dtype, name, **args): """ - Stores the entry into an entry array. + Store the entry into an entry array. - The actual output and output filters will be handled elsewhere + The actual output and output filters will be handled elsewhere. """ item = KdocItem(name, self.fname, dtype, @@ -448,18 +456,37 @@ class KernelDoc: self.config.log.debug("Output: %s:%s = %s", dtype, name, pformat(args)) + def emit_unused_warnings(self): + """ + When the parser fails to produce a valid entry, it places some + warnings under `entry.warnings` that will be discarded when resetting + the state. + + Ensure that those warnings are not lost. + + .. note:: + + Because we are calling `config.warning()` here, those + warnings are not filtered by the `-W` parameters: they will all + be produced even when `-Wreturn`, `-Wshort-desc`, and/or + `-Wcontents-before-sections` are used. + + Allowing those warnings to be filtered is complex, because it + would require storing them in a buffer and then filtering them + during the output step of the code, depending on the + selected symbols. + """ + if self.entry and self.entry not in self.entries: + for log_msg in self.entry.warnings: + self.config.warning(log_msg) + def reset_state(self, ln): """ Ancillary routine to create a new entry. It initializes all variables used by the state machine. """ - # - # Flush the warnings out before we proceed further - # - if self.entry and self.entry not in self.entries: - for log_msg in self.entry.warnings: - self.config.log.warning(log_msg) + self.emit_unused_warnings() self.entry = KernelEntry(self.config, self.fname, ln) @@ -663,10 +690,12 @@ class KernelDoc: self.emit_msg(ln, f"No description found for return value of '{declaration_name}'") - # - # Split apart a structure prototype; returns (struct|union, name, members) or None - # def split_struct_proto(self, proto): + """ + Split apart a structure prototype; returns (struct|union, name, + members) or ``None``. + """ + type_pattern = r'(struct|union)' qualifiers = [ "__attribute__", @@ -685,21 +714,26 @@ class KernelDoc: if r.search(proto): return (r.group(1), r.group(3), r.group(2)) return None - # - # Rewrite the members of a structure or union for easier formatting later on. - # Among other things, this function will turn a member like: - # - # struct { inner_members; } foo; - # - # into: - # - # struct foo; inner_members; - # + def rewrite_struct_members(self, members): + """ + Process ``struct``/``union`` members from the most deeply nested + outward. + + Rewrite the members of a ``struct`` or ``union`` for easier formatting + later on. Among other things, this function will turn a member like:: + + struct { inner_members; } foo; + + into:: + + struct foo; inner_members; + """ + # - # Process struct/union members from the most deeply nested outward. The - # trick is in the ^{ below - it prevents a match of an outer struct/union - # until the inner one has been munged (removing the "{" in the process). + # The trick is in the ``^{`` below - it prevents a match of an outer + # ``struct``/``union`` until the inner one has been munged + # (removing the ``{`` in the process). # struct_members = KernRe(r'(struct|union)' # 0: declaration type r'([^\{\};]+)' # 1: possible name @@ -777,11 +811,12 @@ class KernelDoc: tuples = struct_members.findall(members) return members - # - # Format the struct declaration into a standard form for inclusion in the - # resulting docs. - # def format_struct_decl(self, declaration): + """ + Format the ``struct`` declaration into a standard form for inclusion + in the resulting docs. + """ + # # Insert newlines, get rid of extra spaces. # @@ -815,7 +850,7 @@ class KernelDoc: def dump_struct(self, ln, proto): """ - Store an entry for a struct or union + Store an entry for a ``struct`` or ``union`` """ # # Do the basic parse to get the pieces of the declaration. @@ -857,7 +892,7 @@ class KernelDoc: def dump_enum(self, ln, proto): """ - Stores an enum inside self.entries array. + Store an ``enum`` inside self.entries array. """ # # Strip preprocessor directives. Note that this depends on the @@ -927,9 +962,84 @@ class KernelDoc: self.output_declaration('enum', declaration_name, purpose=self.entry.declaration_purpose) + def dump_var(self, ln, proto): + """ + Store variables that are part of kAPI. + """ + VAR_ATTRIBS = [ + "extern", + ] + OPTIONAL_VAR_ATTR = "^(?:" + "|".join(VAR_ATTRIBS) + ")?" + + sub_prefixes = [ + (KernRe(r"__read_mostly"), ""), + (KernRe(r"__ro_after_init"), ""), + (KernRe(r"(?://.*)$"), ""), + (KernRe(r"(?:/\*.*\*/)"), ""), + (KernRe(r";$"), ""), + (KernRe(r"=.*"), ""), + ] + + # + # Store the full prototype before modifying it + # + full_proto = proto + declaration_name = None + + # + # Handle macro definitions + # + macro_prefixes = [ + KernRe(r"DEFINE_[\w_]+\s*\(([\w_]+)\)"), + ] + + for r in macro_prefixes: + match = r.search(proto) + if match: + declaration_name = match.group(1) + break + + # + # Drop comments and macros to have a pure C prototype + # + if not declaration_name: + for r, sub in sub_prefixes: + proto = r.sub(sub, proto) + + proto = proto.rstrip() + + # + # Variable name is at the end of the declaration + # + + default_val = None + + r= KernRe(OPTIONAL_VAR_ATTR + r"\w.*\s+(?:\*+)?([\w_]+)\s*[\d\]\[]*\s*(=.*)?") + if r.match(proto): + if not declaration_name: + declaration_name = r.group(1) + + default_val = r.group(2) + else: + r= KernRe(OPTIONAL_VAR_ATTR + r"(?:\w.*)?\s+(?:\*+)?(?:[\w_]+)\s*[\d\]\[]*\s*(=.*)?") + if r.match(proto): + default_val = r.group(1) + + if not declaration_name: + self.emit_msg(ln,f"{proto}: can't parse variable") + return + + if default_val: + default_val = default_val.lstrip("=").strip() + + self.output_declaration("var", declaration_name, + full_proto=full_proto, + default_val=default_val, + purpose=self.entry.declaration_purpose) + def dump_declaration(self, ln, prototype): """ - Stores a data declaration inside self.entries array. + Store a data declaration inside self.entries array. """ if self.entry.decl_type == "enum": @@ -938,13 +1048,15 @@ class KernelDoc: self.dump_typedef(ln, prototype) elif self.entry.decl_type in ["union", "struct"]: self.dump_struct(ln, prototype) + elif self.entry.decl_type == "var": + self.dump_var(ln, prototype) else: # This would be a bug self.emit_message(ln, f'Unknown declaration type: {self.entry.decl_type}') def dump_function(self, ln, prototype): """ - Stores a function or function macro inside self.entries array. + Store a function or function macro inside self.entries array. """ found = func_macro = False @@ -1045,7 +1157,7 @@ class KernelDoc: def dump_typedef(self, ln, proto): """ - Stores a typedef inside self.entries array. + Store a ``typedef`` inside self.entries array. """ # # We start by looking for function typedefs. @@ -1099,7 +1211,7 @@ class KernelDoc: @staticmethod def process_export(function_set, line): """ - process EXPORT_SYMBOL* tags + process ``EXPORT_SYMBOL*`` tags This method doesn't use any variable from the class, so declare it with a staticmethod decorator. @@ -1130,7 +1242,7 @@ class KernelDoc: def process_normal(self, ln, line): """ - STATE_NORMAL: looking for the /** to begin everything. + STATE_NORMAL: looking for the ``/**`` to begin everything. """ if not doc_start.match(line): @@ -1220,10 +1332,10 @@ class KernelDoc: else: self.emit_msg(ln, f"Cannot find identifier on line:\n{line}") - # - # Helper function to determine if a new section is being started. - # def is_new_section(self, ln, line): + """ + Helper function to determine if a new section is being started. + """ if doc_sect.search(line): self.state = state.BODY # @@ -1255,10 +1367,10 @@ class KernelDoc: return True return False - # - # Helper function to detect (and effect) the end of a kerneldoc comment. - # def is_comment_end(self, ln, line): + """ + Helper function to detect (and effect) the end of a kerneldoc comment. + """ if doc_end.search(line): self.dump_section() @@ -1277,7 +1389,7 @@ class KernelDoc: def process_decl(self, ln, line): """ - STATE_DECLARATION: We've seen the beginning of a declaration + STATE_DECLARATION: We've seen the beginning of a declaration. """ if self.is_new_section(ln, line) or self.is_comment_end(ln, line): return @@ -1306,7 +1418,7 @@ class KernelDoc: def process_special(self, ln, line): """ - STATE_SPECIAL_SECTION: a section ending with a blank line + STATE_SPECIAL_SECTION: a section ending with a blank line. """ # # If we have hit a blank line (only the " * " marker), then this @@ -1396,7 +1508,7 @@ class KernelDoc: def syscall_munge(self, ln, proto): # pylint: disable=W0613 """ - Handle syscall definitions + Handle syscall definitions. """ is_void = False @@ -1435,7 +1547,7 @@ class KernelDoc: def tracepoint_munge(self, ln, proto): """ - Handle tracepoint definitions + Handle tracepoint definitions. """ tracepointname = None @@ -1471,7 +1583,7 @@ class KernelDoc: return proto def process_proto_function(self, ln, line): - """Ancillary routine to process a function prototype""" + """Ancillary routine to process a function prototype.""" # strip C99-style comments to end of line line = KernRe(r"//.*$", re.S).sub('', line) @@ -1516,7 +1628,9 @@ class KernelDoc: self.reset_state(ln) def process_proto_type(self, ln, line): - """Ancillary routine to process a type""" + """ + Ancillary routine to process a type. + """ # Strip C99-style comments and surrounding whitespace line = KernRe(r"//.*$", re.S).sub('', line).strip() @@ -1570,7 +1684,7 @@ class KernelDoc: self.process_proto_type(ln, line) def process_docblock(self, ln, line): - """STATE_DOCBLOCK: within a DOC: block.""" + """STATE_DOCBLOCK: within a ``DOC:`` block.""" if doc_end.search(line): self.dump_section() @@ -1582,7 +1696,7 @@ class KernelDoc: def parse_export(self): """ - Parses EXPORT_SYMBOL* macros from a single Kernel source file. + Parses ``EXPORT_SYMBOL*`` macros from a single Kernel source file. """ export_table = set() @@ -1599,10 +1713,7 @@ class KernelDoc: return export_table - # - # The state/action table telling us which function to invoke in - # each state. - # + #: The state/action table telling us which function to invoke in each state. state_actions = { state.NORMAL: process_normal, state.NAME: process_name, @@ -1664,6 +1775,8 @@ class KernelDoc: # Hand this line to the appropriate state handler self.state_actions[self.state](self, ln, line) + self.emit_unused_warnings() + except OSError: self.config.log.error(f"Error: Cannot open file {self.fname}") diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py index 2dfa1bf83d64..0bf9e01cdc57 100644 --- a/tools/lib/python/kdoc/kdoc_re.py +++ b/tools/lib/python/kdoc/kdoc_re.py @@ -51,6 +51,9 @@ class KernRe: """ return self.regex.pattern + def __repr__(self): + return f're.compile("{self.regex.pattern}")' + def __add__(self, other): """ Allows adding two regular expressions into one. @@ -61,7 +64,7 @@ class KernRe: def match(self, string): """ - Handles a re.match storing its results + Handles a re.match storing its results. """ self.last_match = self.regex.match(string) @@ -69,7 +72,7 @@ class KernRe: def search(self, string): """ - Handles a re.search storing its results + Handles a re.search storing its results. """ self.last_match = self.regex.search(string) @@ -77,28 +80,28 @@ class KernRe: def findall(self, string): """ - Alias to re.findall + Alias to re.findall. """ return self.regex.findall(string) def split(self, string): """ - Alias to re.split + Alias to re.split. """ return self.regex.split(string) def sub(self, sub, string, count=0): """ - Alias to re.sub + Alias to re.sub. """ return self.regex.sub(sub, string, count=count) def group(self, num): """ - Returns the group results of the last match + Returns the group results of the last match. """ return self.last_match.group(num) @@ -110,7 +113,7 @@ class NestedMatch: even harder on Python with its normal re module, as there are several advanced regular expressions that are missing. - This is the case of this pattern: + This is the case of this pattern:: '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;' @@ -121,6 +124,7 @@ class NestedMatch: replace nested expressions. The original approach was suggested by: + https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex Although I re-implemented it to make it more generic and match 3 types @@ -224,14 +228,18 @@ class NestedMatch: yield line[t[0]:t[2]] def sub(self, regex, sub, line, count=0): - """ + r""" This is similar to re.sub: It matches a regex that it is followed by a delimiter, replacing occurrences only if all delimiters are paired. - if r'\1' is used, it works just like re: it places there the - matched paired data with the delimiter stripped. + if the sub argument contains:: + + r'\1' + + it will work just like re: it places there the matched paired data + with the delimiter stripped. If count is different than zero, it will replace at most count items. diff --git a/tools/lib/python/kdoc/latex_fonts.py b/tools/lib/python/kdoc/latex_fonts.py index 29317f8006ea..1d04cbda169f 100755 --- a/tools/lib/python/kdoc/latex_fonts.py +++ b/tools/lib/python/kdoc/latex_fonts.py @@ -5,12 +5,13 @@ # Ported to Python by (c) Mauro Carvalho Chehab, 2025 """ -Detect problematic Noto CJK variable fonts. +Detect problematic Noto CJK variable fonts +========================================== -For "make pdfdocs", reports of build errors of translations.pdf started -arriving early 2024 [1, 2]. It turned out that Fedora and openSUSE -tumbleweed have started deploying variable-font [3] format of "Noto CJK" -fonts [4, 5]. For PDF, a LaTeX package named xeCJK is used for CJK +For ``make pdfdocs``, reports of build errors of translations.pdf started +arriving early 2024 [1]_ [2]_. It turned out that Fedora and openSUSE +tumbleweed have started deploying variable-font [3]_ format of "Noto CJK" +fonts [4]_ [5]_. For PDF, a LaTeX package named xeCJK is used for CJK (Chinese, Japanese, Korean) pages. xeCJK requires XeLaTeX/XeTeX, which does not (and likely never will) understand variable fonts for historical reasons. @@ -25,68 +26,77 @@ This script is invoked from the error path of "make pdfdocs" and emits suggestions if variable-font files of "Noto CJK" fonts are in the list of fonts accessible from XeTeX. -References: -[1]: https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/ -[2]: https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/ -[3]: https://en.wikipedia.org/wiki/Variable_font -[4]: https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts -[5]: https://build.opensuse.org/request/show/1157217 +.. [1] https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/ +.. [2] https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/ +.. [3] https://en.wikipedia.org/wiki/Variable_font +.. [4] https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts +.. [5] https://build.opensuse.org/request/show/1157217 -#=========================================================================== Workarounds for building translations.pdf -#=========================================================================== +----------------------------------------- * Denylist "variable font" Noto CJK fonts. + - Create $HOME/deny-vf/fontconfig/fonts.conf from template below, with tweaks if necessary. Remove leading "". + - Path of fontconfig/fonts.conf can be overridden by setting an env variable FONTS_CONF_DENY_VF. - * Template: ------------------------------------------------------------------ -<?xml version="1.0"?> -<!DOCTYPE fontconfig SYSTEM "urn:fontconfig:fonts.dtd"> -<fontconfig> -<!-- - Ignore variable-font glob (not to break xetex) ---> - <selectfont> - <rejectfont> - <!-- - for Fedora - --> - <glob>/usr/share/fonts/google-noto-*-cjk-vf-fonts</glob> - <!-- - for openSUSE tumbleweed - --> - <glob>/usr/share/fonts/truetype/Noto*CJK*-VF.otf</glob> - </rejectfont> - </selectfont> -</fontconfig> ------------------------------------------------------------------ + * Template:: + + <?xml version="1.0"?> + <!DOCTYPE fontconfig SYSTEM "urn:fontconfig:fonts.dtd"> + <fontconfig> + <!-- + Ignore variable-font glob (not to break xetex) + --> + <selectfont> + <rejectfont> + <!-- + for Fedora + --> + <glob>/usr/share/fonts/google-noto-*-cjk-vf-fonts</glob> + <!-- + for openSUSE tumbleweed + --> + <glob>/usr/share/fonts/truetype/Noto*CJK*-VF.otf</glob> + </rejectfont> + </selectfont> + </fontconfig> The denylisting is activated for "make pdfdocs". * For skipping CJK pages in PDF + - Uninstall texlive-xecjk. Denylisting is not needed in this case. * For printing CJK pages in PDF + - Need non-variable "Noto CJK" fonts. + * Fedora + - google-noto-sans-cjk-fonts - google-noto-serif-cjk-fonts + * openSUSE tumbleweed + - Non-variable "Noto CJK" fonts are not available as distro packages as of April, 2024. Fetch a set of font files from upstream Noto CJK Font released at: + https://github.com/notofonts/noto-cjk/tree/main/Sans#super-otc + and at: + https://github.com/notofonts/noto-cjk/tree/main/Serif#super-otc - , then uncompress and deploy them. + + then uncompress and deploy them. - Remember to update fontconfig cache by running fc-cache. -!!! Caution !!! +.. caution:: Uninstalling "variable font" packages can be dangerous. They might be depended upon by other packages important for your work. Denylisting should be less invasive, as it is effective only while @@ -115,10 +125,15 @@ class LatexFontChecker: self.re_cjk = re.compile(r"([^:]+):\s*Noto\s+(Sans|Sans Mono|Serif) CJK") def description(self): + """ + Returns module description. + """ return __doc__ def get_noto_cjk_vf_fonts(self): - """Get Noto CJK fonts""" + """ + Get Noto CJK fonts. + """ cjk_fonts = set() cmd = ["fc-list", ":", "file", "family", "variable"] @@ -143,7 +158,9 @@ class LatexFontChecker: return sorted(cjk_fonts) def check(self): - """Check for problems with CJK fonts""" + """ + Check for problems with CJK fonts. + """ fonts = textwrap.indent("\n".join(self.get_noto_cjk_vf_fonts()), " ") if not fonts: diff --git a/tools/lib/python/kdoc/parse_data_structs.py b/tools/lib/python/kdoc/parse_data_structs.py index 25361996cd20..9941cd19032e 100755 --- a/tools/lib/python/kdoc/parse_data_structs.py +++ b/tools/lib/python/kdoc/parse_data_structs.py @@ -9,12 +9,12 @@ Parse a source file or header, creating ReStructured Text cross references. It accepts an optional file to change the default symbol reference or to suppress symbols from the output. -It is capable of identifying defines, functions, structs, typedefs, -enums and enum symbols and create cross-references for all of them. +It is capable of identifying ``define``, function, ``struct``, ``typedef``, +``enum`` and ``enum`` symbols and create cross-references for all of them. It is also capable of distinguish #define used for specifying a Linux ioctl. -The optional rules file contains a set of rules like: +The optional rules file contains a set of rules like:: ignore ioctl VIDIOC_ENUM_FMT replace ioctl VIDIOC_DQBUF vidioc_qbuf @@ -34,8 +34,8 @@ class ParseDataStructs: It is meant to allow having a more comprehensive documentation, where uAPI headers will create cross-reference links to the code. - It is capable of identifying defines, functions, structs, typedefs, - enums and enum symbols and create cross-references for all of them. + It is capable of identifying ``define``, function, ``struct``, ``typedef``, + ``enum`` and ``enum`` symbols and create cross-references for all of them. It is also capable of distinguish #define used for specifying a Linux ioctl. @@ -43,13 +43,13 @@ class ParseDataStructs: allows parsing an exception file. Such file contains a set of rules using the syntax below: - 1. Ignore rules: + 1. Ignore rules:: ignore <type> <symbol>` Removes the symbol from reference generation. - 2. Replace rules: + 2. Replace rules:: replace <type> <old_symbol> <new_reference> @@ -58,22 +58,22 @@ class ParseDataStructs: - A simple symbol name; - A full Sphinx reference. - 3. Namespace rules + 3. Namespace rules:: namespace <namespace> Sets C namespace to be used during cross-reference generation. Can be overridden by replace rules. - On ignore and replace rules, <type> can be: - - ioctl: for defines that end with _IO*, e.g. ioctl definitions - - define: for other defines - - symbol: for symbols defined within enums; - - typedef: for typedefs; - - enum: for the name of a non-anonymous enum; - - struct: for structs. + On ignore and replace rules, ``<type>`` can be: + - ``ioctl``: for defines that end with ``_IO*``, e.g. ioctl definitions + - ``define``: for other defines + - ``symbol``: for symbols defined within enums; + - ``typedef``: for typedefs; + - ``enum``: for the name of a non-anonymous enum; + - ``struct``: for structs. - Examples: + Examples:: ignore define __LINUX_MEDIA_H ignore ioctl VIDIOC_ENUM_FMT @@ -83,13 +83,15 @@ class ParseDataStructs: namespace MC """ - # Parser regexes with multiple ways to capture enums and structs + #: Parser regex with multiple ways to capture enums. RE_ENUMS = [ re.compile(r"^\s*enum\s+([\w_]+)\s*\{"), re.compile(r"^\s*enum\s+([\w_]+)\s*$"), re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"), re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"), ] + + #: Parser regex with multiple ways to capture structs. RE_STRUCTS = [ re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"), re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"), @@ -97,11 +99,13 @@ class ParseDataStructs: re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"), ] - # FIXME: the original code was written a long time before Sphinx C + # NOTE: the original code was written a long time before Sphinx C # domain to have multiple namespaces. To avoid to much turn at the # existing hyperlinks, the code kept using "c:type" instead of the # right types. To change that, we need to change the types not only # here, but also at the uAPI media documentation. + + #: Dictionary containing C type identifiers to be transformed. DEF_SYMBOL_TYPES = { "ioctl": { "prefix": "\\ ", @@ -158,6 +162,10 @@ class ParseDataStructs: self.symbols[symbol_type] = {} def read_exceptions(self, fname: str): + """ + Read an optional exceptions file, used to override defaults. + """ + if not fname: return @@ -242,9 +250,9 @@ class ParseDataStructs: def store_type(self, ln, symbol_type: str, symbol: str, ref_name: str = None, replace_underscores: bool = True): """ - Stores a new symbol at self.symbols under symbol_type. + Store a new symbol at self.symbols under symbol_type. - By default, underscores are replaced by "-" + By default, underscores are replaced by ``-``. """ defs = self.DEF_SYMBOL_TYPES[symbol_type] @@ -276,12 +284,16 @@ class ParseDataStructs: self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln) def store_line(self, line): - """Stores a line at self.data, properly indented""" + """ + Store a line at self.data, properly indented. + """ line = " " + line.expandtabs() self.data += line.rstrip(" ") def parse_file(self, file_in: str, exceptions: str = None): - """Reads a C source file and get identifiers""" + """ + Read a C source file and get identifiers. + """ self.data = "" is_enum = False is_comment = False @@ -433,7 +445,7 @@ class ParseDataStructs: def gen_toc(self): """ - Create a list of symbols to be part of a TOC contents table + Create a list of symbols to be part of a TOC contents table. """ text = [] @@ -464,6 +476,10 @@ class ParseDataStructs: return "\n".join(text) def write_output(self, file_in: str, file_out: str, toc: bool): + """ + Write a ReST output file. + """ + title = os.path.basename(file_in) if toc: diff --git a/tools/lib/python/kdoc/python_version.py b/tools/lib/python/kdoc/python_version.py index e83088013db2..4ddb7ead5f56 100644 --- a/tools/lib/python/kdoc/python_version.py +++ b/tools/lib/python/kdoc/python_version.py @@ -33,21 +33,31 @@ class PythonVersion: """ def __init__(self, version): - """Ïnitialize self.version tuple from a version string""" + """ + Ïnitialize self.version tuple from a version string. + """ self.version = self.parse_version(version) @staticmethod def parse_version(version): - """Convert a major.minor.patch version into a tuple""" + """ + Convert a major.minor.patch version into a tuple. + """ return tuple(int(x) for x in version.split(".")) @staticmethod def ver_str(version): - """Returns a version tuple as major.minor.patch""" + """ + Returns a version tuple as major.minor.patch. + """ return ".".join([str(x) for x in version]) @staticmethod def cmd_print(cmd, max_len=80): + """ + Outputs a command line, repecting maximum width. + """ + cmd_line = [] for w in cmd: @@ -66,7 +76,9 @@ class PythonVersion: return "\n ".join(cmd_line) def __str__(self): - """Returns a version tuple as major.minor.patch from self.version""" + """ + Return a version tuple as major.minor.patch from self.version. + """ return self.ver_str(self.version) @staticmethod diff --git a/tools/lib/thermal/libthermal.pc.template b/tools/lib/thermal/libthermal.pc.template index ac24d0ab17f5..3b8a24d0a8b8 100644 --- a/tools/lib/thermal/libthermal.pc.template +++ b/tools/lib/thermal/libthermal.pc.template @@ -8,5 +8,5 @@ Name: libthermal Description: thermal library Requires: libnl-3.0 libnl-genl-3.0 Version: @VERSION@ -Libs: -L${libdir} -lnl-genl-3 -lnl-3 -Cflags: -I${includedir} -I${include}/libnl3 +Libs: -L${libdir} -lnl-genl-3 -lnl-3 -lthermal +Cflags: -I${includedir} -I${includedir}/libnl3 diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index 80cdbd3db82d..54c7265ab52d 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -1405,7 +1405,7 @@ struct option opts[] = { { "numa", no_argument, NULL, 'n' }, { "lines", required_argument, NULL, 'N'}, { "ops", no_argument, NULL, 'o' }, - { "partial", no_argument, NULL, 'p'}, + { "partial", no_argument, NULL, 'P'}, { "report", no_argument, NULL, 'r' }, { "shrink", no_argument, NULL, 's' }, { "Size", no_argument, NULL, 'S'}, diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c index 83afc52275a5..d4434df3dcff 100644 --- a/tools/mm/thp_swap_allocator_test.c +++ b/tools/mm/thp_swap_allocator_test.c @@ -142,7 +142,7 @@ int main(int argc, char *argv[]) } if (use_small_folio) { - mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP); + mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_SMALLFOLIO); if (mem2 == NULL) { fprintf(stderr, "Failed to allocate small folios memory\n"); free(mem1); diff --git a/tools/net/sunrpc/xdrgen/README b/tools/net/sunrpc/xdrgen/README index 27218a78ab40..2cf05d1e4cd9 100644 --- a/tools/net/sunrpc/xdrgen/README +++ b/tools/net/sunrpc/xdrgen/README @@ -250,8 +250,6 @@ Add more pragma directives: Enable something like a #include to dynamically insert the content of other specification files -Properly support line-by-line pass-through via the "%" decorator - Build a unit test suite for verifying translation of XDR language into compilable code diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py index e22632cf38fb..5c3a4a47ded8 100644 --- a/tools/net/sunrpc/xdrgen/generators/__init__.py +++ b/tools/net/sunrpc/xdrgen/generators/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path from jinja2 import Environment, FileSystemLoader, Template from xdr_ast import _XdrAst, Specification, _RpcProgram, _XdrTypeSpecifier -from xdr_ast import public_apis, pass_by_reference, get_header_name +from xdr_ast import public_apis, pass_by_reference, structs, get_header_name from xdr_parse import get_xdr_annotate @@ -25,6 +25,7 @@ def create_jinja2_environment(language: str, xdr_type: str) -> Environment: environment.globals["annotate"] = get_xdr_annotate() environment.globals["public_apis"] = public_apis environment.globals["pass_by_reference"] = pass_by_reference + environment.globals["structs"] = structs return environment case _: raise NotImplementedError("Language not supported") @@ -58,6 +59,8 @@ def kernel_c_type(spec: _XdrTypeSpecifier) -> str: """Return name of C type""" builtin_native_c_type = { "bool": "bool", + "short": "s16", + "unsigned_short": "u16", "int": "s32", "unsigned_int": "u32", "long": "s32", diff --git a/tools/net/sunrpc/xdrgen/generators/enum.py b/tools/net/sunrpc/xdrgen/generators/enum.py index e62f715d3996..b4ed3ed6431e 100644 --- a/tools/net/sunrpc/xdrgen/generators/enum.py +++ b/tools/net/sunrpc/xdrgen/generators/enum.py @@ -5,6 +5,7 @@ from generators import SourceGenerator, create_jinja2_environment from xdr_ast import _XdrEnum, public_apis, big_endian, get_header_name +from xdr_parse import get_xdr_enum_validation class XdrEnumGenerator(SourceGenerator): @@ -42,7 +43,13 @@ class XdrEnumGenerator(SourceGenerator): template = self.environment.get_template("decoder/enum_be.j2") else: template = self.environment.get_template("decoder/enum.j2") - print(template.render(name=node.name)) + print( + template.render( + name=node.name, + enumerators=node.enumerators, + validate=get_xdr_enum_validation(), + ) + ) def emit_encoder(self, node: _XdrEnum) -> None: """Emit one encoder function for an XDR enum type""" diff --git a/tools/net/sunrpc/xdrgen/generators/passthru.py b/tools/net/sunrpc/xdrgen/generators/passthru.py new file mode 100644 index 000000000000..cb17bd977f1e --- /dev/null +++ b/tools/net/sunrpc/xdrgen/generators/passthru.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# ex: set filetype=python: + +"""Generate code for XDR pass-through lines""" + +from generators import SourceGenerator, create_jinja2_environment +from xdr_ast import _XdrPassthru + + +class XdrPassthruGenerator(SourceGenerator): + """Generate source code for XDR pass-through content""" + + def __init__(self, language: str, peer: str): + """Initialize an instance of this class""" + self.environment = create_jinja2_environment(language, "passthru") + self.peer = peer + + def emit_definition(self, node: _XdrPassthru) -> None: + """Emit one pass-through line""" + template = self.environment.get_template("definition.j2") + print(template.render(content=node.content)) + + def emit_decoder(self, node: _XdrPassthru) -> None: + """Emit one pass-through line""" + template = self.environment.get_template("source.j2") + print(template.render(content=node.content)) diff --git a/tools/net/sunrpc/xdrgen/generators/program.py b/tools/net/sunrpc/xdrgen/generators/program.py index ac3cf1694b68..c0cb3f6d3319 100644 --- a/tools/net/sunrpc/xdrgen/generators/program.py +++ b/tools/net/sunrpc/xdrgen/generators/program.py @@ -5,8 +5,9 @@ from jinja2 import Environment -from generators import SourceGenerator, create_jinja2_environment +from generators import SourceGenerator, create_jinja2_environment, get_jinja2_template from xdr_ast import _RpcProgram, _RpcVersion, excluded_apis +from xdr_ast import max_widths, get_header_name def emit_version_definitions( @@ -127,6 +128,9 @@ class XdrProgramGenerator(SourceGenerator): for version in node.versions: emit_version_definitions(self.environment, program, version) + template = self.environment.get_template("definition/program.j2") + print(template.render(name=raw_name, value=node.number)) + def emit_declaration(self, node: _RpcProgram) -> None: """Emit a declaration pair for each of an RPC programs's procedures""" raw_name = node.name @@ -166,3 +170,35 @@ class XdrProgramGenerator(SourceGenerator): emit_version_argument_encoders( self.environment, program, version, ) + + def emit_maxsize(self, node: _RpcProgram) -> None: + """Emit maxsize macro for maximum RPC argument size""" + header = get_header_name().upper() + + # Find the largest argument across all versions + max_arg_width = 0 + max_arg_name = None + for version in node.versions: + for procedure in version.procedures: + if procedure.name in excluded_apis: + continue + arg_name = procedure.argument.type_name + if arg_name == "void": + continue + if arg_name not in max_widths: + continue + if max_widths[arg_name] > max_arg_width: + max_arg_width = max_widths[arg_name] + max_arg_name = arg_name + + if max_arg_name is None: + return + + macro_name = header + "_MAX_ARGS_SZ" + template = get_jinja2_template(self.environment, "maxsize", "max_args") + print( + template.render( + macro=macro_name, + width=header + "_" + max_arg_name + "_sz", + ) + ) diff --git a/tools/net/sunrpc/xdrgen/generators/typedef.py b/tools/net/sunrpc/xdrgen/generators/typedef.py index fab72e9d6915..75e3a40e14e1 100644 --- a/tools/net/sunrpc/xdrgen/generators/typedef.py +++ b/tools/net/sunrpc/xdrgen/generators/typedef.py @@ -58,7 +58,7 @@ def emit_typedef_declaration(environment: Environment, node: _XdrDeclaration) -> elif isinstance(node, _XdrOptionalData): raise NotImplementedError("<optional_data> typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError("<void> typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") @@ -104,7 +104,7 @@ def emit_type_definition(environment: Environment, node: _XdrDeclaration) -> Non elif isinstance(node, _XdrOptionalData): raise NotImplementedError("<optional_data> typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError("<void> typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") @@ -165,7 +165,7 @@ def emit_typedef_decoder(environment: Environment, node: _XdrDeclaration) -> Non elif isinstance(node, _XdrOptionalData): raise NotImplementedError("<optional_data> typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError("<void> typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") @@ -225,7 +225,7 @@ def emit_typedef_encoder(environment: Environment, node: _XdrDeclaration) -> Non elif isinstance(node, _XdrOptionalData): raise NotImplementedError("<optional_data> typedef not yet implemented") elif isinstance(node, _XdrVoid): - raise NotImplementedError("<void> typedef not yet implemented") + raise ValueError("invalid void usage in RPC Specification") else: raise NotImplementedError("typedef: type not recognized") diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py index ad1f214ef22a..d15837dae651 100644 --- a/tools/net/sunrpc/xdrgen/generators/union.py +++ b/tools/net/sunrpc/xdrgen/generators/union.py @@ -84,6 +84,31 @@ def emit_union_switch_spec_decoder( print(template.render(name=node.name, type=node.spec.type_name)) +def emit_union_arm_decoder( + environment: Environment, node: _XdrCaseSpec +) -> None: + """Emit decoder for an XDR union's arm (data only, no case/break)""" + + if isinstance(node.arm, _XdrVoid): + return + if isinstance(node.arm, _XdrString): + type_name = "char *" + classifier = "" + else: + type_name = node.arm.spec.type_name + classifier = node.arm.spec.c_classifier + + assert isinstance(node.arm, (_XdrBasic, _XdrString)) + template = get_jinja2_template(environment, "decoder", node.arm.template) + print( + template.render( + name=node.arm.name, + type=type_name, + classifier=classifier, + ) + ) + + def emit_union_case_spec_decoder( environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool ) -> None: @@ -151,19 +176,33 @@ def emit_union_decoder(environment: Environment, node: _XdrUnion) -> None: template = get_jinja2_template(environment, "decoder", "open") print(template.render(name=node.name)) - emit_union_switch_spec_decoder(environment, node.discriminant) + # For boolean discriminants, use if statement instead of switch + if node.discriminant.spec.type_name == "bool": + template = get_jinja2_template(environment, "decoder", "bool_spec") + print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name)) - for case in node.cases: - emit_union_case_spec_decoder( - environment, - case, - node.discriminant.spec.type_name in big_endian, - ) + # Find and emit the TRUE case + for case in node.cases: + if case.values and case.values[0] == "TRUE": + emit_union_arm_decoder(environment, case) + break - emit_union_default_spec_decoder(environment, node) + template = get_jinja2_template(environment, "decoder", "close") + print(template.render()) + else: + emit_union_switch_spec_decoder(environment, node.discriminant) - template = get_jinja2_template(environment, "decoder", "close") - print(template.render()) + for case in node.cases: + emit_union_case_spec_decoder( + environment, + case, + node.discriminant.spec.type_name in big_endian, + ) + + emit_union_default_spec_decoder(environment, node) + + template = get_jinja2_template(environment, "decoder", "close") + print(template.render()) def emit_union_switch_spec_encoder( @@ -175,6 +214,28 @@ def emit_union_switch_spec_encoder( print(template.render(name=node.name, type=node.spec.type_name)) +def emit_union_arm_encoder( + environment: Environment, node: _XdrCaseSpec +) -> None: + """Emit encoder for an XDR union's arm (data only, no case/break)""" + + if isinstance(node.arm, _XdrVoid): + return + if isinstance(node.arm, _XdrString): + type_name = "char *" + else: + type_name = node.arm.spec.type_name + + assert isinstance(node.arm, (_XdrBasic, _XdrString)) + template = get_jinja2_template(environment, "encoder", node.arm.template) + print( + template.render( + name=node.arm.name, + type=type_name, + ) + ) + + def emit_union_case_spec_encoder( environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool ) -> None: @@ -235,19 +296,33 @@ def emit_union_encoder(environment, node: _XdrUnion) -> None: template = get_jinja2_template(environment, "encoder", "open") print(template.render(name=node.name)) - emit_union_switch_spec_encoder(environment, node.discriminant) + # For boolean discriminants, use if statement instead of switch + if node.discriminant.spec.type_name == "bool": + template = get_jinja2_template(environment, "encoder", "bool_spec") + print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name)) - for case in node.cases: - emit_union_case_spec_encoder( - environment, - case, - node.discriminant.spec.type_name in big_endian, - ) + # Find and emit the TRUE case + for case in node.cases: + if case.values and case.values[0] == "TRUE": + emit_union_arm_encoder(environment, case) + break - emit_union_default_spec_encoder(environment, node) + template = get_jinja2_template(environment, "encoder", "close") + print(template.render()) + else: + emit_union_switch_spec_encoder(environment, node.discriminant) - template = get_jinja2_template(environment, "encoder", "close") - print(template.render()) + for case in node.cases: + emit_union_case_spec_encoder( + environment, + case, + node.discriminant.spec.type_name in big_endian, + ) + + emit_union_default_spec_encoder(environment, node) + + template = get_jinja2_template(environment, "encoder", "close") + print(template.render()) def emit_union_maxsize(environment: Environment, node: _XdrUnion) -> None: diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark index 7c2c1b8c86d1..1d2afff98ac5 100644 --- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark +++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark @@ -20,9 +20,11 @@ constant : decimal_constant | hexadecimal_constant | octal_consta type_specifier : unsigned_hyper | unsigned_long | unsigned_int + | unsigned_short | hyper | long | int + | short | float | double | quadruple @@ -35,9 +37,11 @@ type_specifier : unsigned_hyper unsigned_hyper : "unsigned" "hyper" unsigned_long : "unsigned" "long" unsigned_int : "unsigned" "int" +unsigned_short : "unsigned" "short" hyper : "hyper" long : "long" int : "int" +short : "short" float : "float" double : "double" quadruple : "quadruple" @@ -74,6 +78,9 @@ definition : constant_def | type_def | program_def | pragma_def + | passthru_def + +passthru_def : PASSTHRU // // RPC program definitions not specified in RFC 4506 @@ -111,8 +118,7 @@ decimal_constant : /[\+-]?(0|[1-9][0-9]*)/ hexadecimal_constant : /0x([a-f]|[A-F]|[0-9])+/ octal_constant : /0[0-7]+/ -PASSTHRU : "%" | "%" /.+/ -%ignore PASSTHRU +PASSTHRU : /%.*/ %import common.C_COMMENT %ignore C_COMMENT diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py index c5e8d79986ef..ed83d48d1f68 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py +++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py @@ -8,9 +8,8 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput +from lark.exceptions import VisitError -from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator from generators.header_bottom import XdrHeaderBottomGenerator from generators.header_top import XdrHeaderTopGenerator @@ -21,9 +20,10 @@ from generators.struct import XdrStructGenerator from generators.union import XdrUnionGenerator from xdr_ast import transform_parse_tree, _RpcProgram, Specification -from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer -from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion +from xdr_ast import _XdrEnum, _XdrPointer, _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import make_error_handler, XdrParseError +from xdr_parse import handle_transform_error logger.setLevel(logging.INFO) @@ -50,20 +50,24 @@ def emit_header_declarations( gen.emit_declaration(definition.value) -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Generate definitions and declarations""" set_xdr_annotate(args.annotate) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) - ast = transform_parse_tree(parse_tree) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 + try: + ast = transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 gen = XdrHeaderTopGenerator(args.language, args.peer) gen.emit_declaration(args.filename, ast) diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py index c956e27f37c0..a48ca0549382 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py +++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py @@ -8,12 +8,13 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput +from lark.exceptions import VisitError from generators.constant import XdrConstantGenerator from generators.enum import XdrEnumGenerator from generators.header_bottom import XdrHeaderBottomGenerator from generators.header_top import XdrHeaderTopGenerator +from generators.passthru import XdrPassthruGenerator from generators.pointer import XdrPointerGenerator from generators.program import XdrProgramGenerator from generators.typedef import XdrTypedefGenerator @@ -21,9 +22,11 @@ from generators.struct import XdrStructGenerator from generators.union import XdrUnionGenerator from xdr_ast import transform_parse_tree, Specification -from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer +from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPassthru, _XdrPointer from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import make_error_handler, XdrParseError +from xdr_parse import handle_transform_error logger.setLevel(logging.INFO) @@ -45,6 +48,8 @@ def emit_header_definitions(root: Specification, language: str, peer: str) -> No gen = XdrStructGenerator(language, peer) elif isinstance(definition.value, _XdrUnion): gen = XdrUnionGenerator(language, peer) + elif isinstance(definition.value, _XdrPassthru): + gen = XdrPassthruGenerator(language, peer) else: continue gen.emit_definition(definition.value) @@ -64,25 +69,31 @@ def emit_header_maxsize(root: Specification, language: str, peer: str) -> None: gen = XdrStructGenerator(language, peer) elif isinstance(definition.value, _XdrUnion): gen = XdrUnionGenerator(language, peer) + elif isinstance(definition.value, _RpcProgram): + gen = XdrProgramGenerator(language, peer) else: continue gen.emit_maxsize(definition.value) -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Generate definitions""" set_xdr_annotate(args.annotate) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) - ast = transform_parse_tree(parse_tree) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 + try: + ast = transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 gen = XdrHeaderTopGenerator(args.language, args.peer) gen.emit_definition(args.filename, ast) diff --git a/tools/net/sunrpc/xdrgen/subcmds/lint.py b/tools/net/sunrpc/xdrgen/subcmds/lint.py index 36cc43717d30..e1da49632e62 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/lint.py +++ b/tools/net/sunrpc/xdrgen/subcmds/lint.py @@ -8,26 +8,31 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput +from lark.exceptions import VisitError -from xdr_parse import xdr_parser +from xdr_parse import xdr_parser, make_error_handler, XdrParseError +from xdr_parse import handle_transform_error from xdr_ast import transform_parse_tree logger.setLevel(logging.DEBUG) -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Lexical and syntax check of an XDR specification""" parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) - transform_parse_tree(parse_tree) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 + try: + transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 return 0 diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py index 2024954748f0..27e8767b1b58 100644 --- a/tools/net/sunrpc/xdrgen/subcmds/source.py +++ b/tools/net/sunrpc/xdrgen/subcmds/source.py @@ -8,10 +8,11 @@ import logging from argparse import Namespace from lark import logger -from lark.exceptions import UnexpectedInput +from lark.exceptions import VisitError from generators.source_top import XdrSourceTopGenerator from generators.enum import XdrEnumGenerator +from generators.passthru import XdrPassthruGenerator from generators.pointer import XdrPointerGenerator from generators.program import XdrProgramGenerator from generators.typedef import XdrTypedefGenerator @@ -19,10 +20,12 @@ from generators.struct import XdrStructGenerator from generators.union import XdrUnionGenerator from xdr_ast import transform_parse_tree, _RpcProgram, Specification -from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer +from xdr_ast import _XdrAst, _XdrEnum, _XdrPassthru, _XdrPointer from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion -from xdr_parse import xdr_parser, set_xdr_annotate +from xdr_parse import xdr_parser, set_xdr_annotate, set_xdr_enum_validation +from xdr_parse import make_error_handler, XdrParseError +from xdr_parse import handle_transform_error logger.setLevel(logging.INFO) @@ -72,40 +75,54 @@ def generate_server_source(filename: str, root: Specification, language: str) -> gen.emit_source(filename, root) for definition in root.definitions: - emit_source_decoder(definition.value, language, "server") + if isinstance(definition.value, _XdrPassthru): + passthru_gen = XdrPassthruGenerator(language, "server") + passthru_gen.emit_decoder(definition.value) + else: + emit_source_decoder(definition.value, language, "server") for definition in root.definitions: - emit_source_encoder(definition.value, language, "server") + if not isinstance(definition.value, _XdrPassthru): + emit_source_encoder(definition.value, language, "server") def generate_client_source(filename: str, root: Specification, language: str) -> None: - """Generate server-side source code""" + """Generate client-side source code""" gen = XdrSourceTopGenerator(language, "client") gen.emit_source(filename, root) - print("") for definition in root.definitions: - emit_source_encoder(definition.value, language, "client") + if isinstance(definition.value, _XdrPassthru): + passthru_gen = XdrPassthruGenerator(language, "client") + passthru_gen.emit_decoder(definition.value) + else: + emit_source_encoder(definition.value, language, "client") for definition in root.definitions: - emit_source_decoder(definition.value, language, "client") + if not isinstance(definition.value, _XdrPassthru): + emit_source_decoder(definition.value, language, "client") # cel: todo: client needs PROC macros -def handle_parse_error(e: UnexpectedInput) -> bool: - """Simple parse error reporting, no recovery attempted""" - print(e) - return True - - def subcmd(args: Namespace) -> int: """Generate encoder and decoder functions""" set_xdr_annotate(args.annotate) + set_xdr_enum_validation(not args.no_enum_validation) parser = xdr_parser() with open(args.filename, encoding="utf-8") as f: - parse_tree = parser.parse(f.read(), on_error=handle_parse_error) - ast = transform_parse_tree(parse_tree) + source = f.read() + try: + parse_tree = parser.parse( + source, on_error=make_error_handler(source, args.filename) + ) + except XdrParseError: + return 1 + try: + ast = transform_parse_tree(parse_tree) + except VisitError as e: + handle_transform_error(e, source, args.filename) + return 1 match args.peer: case "server": generate_server_source(args.filename, ast, args.language) diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 index d1405c7c5354..c7ae506076bb 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 @@ -1,4 +1,3 @@ {# SPDX-License-Identifier: GPL-2.0 #} - bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr); bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, {{ name }} value); diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 index 6482984f1cb7..735a34157fdf 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 @@ -14,6 +14,17 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr) if (xdr_stream_decode_u32(xdr, &val) < 0) return false; +{% if validate and enumerators %} + /* Compiler may optimize to a range check for dense enums */ + switch (val) { +{% for e in enumerators %} + case {{ e.name }}: +{% endfor %} + break; + default: + return false; + } +{% endif %} *ptr = val; return true; } diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 index 44c391c10b42..82782a510d47 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 @@ -10,5 +10,25 @@ static bool __maybe_unused {% endif %} xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr) { +{% if validate and enumerators %} + __be32 raw; + u32 val; + + if (xdr_stream_decode_be32(xdr, &raw) < 0) + return false; + val = be32_to_cpu(raw); + /* Compiler may optimize to a range check for dense enums */ + switch (val) { +{% for e in enumerators %} + case {{ e.name }}: +{% endfor %} + break; + default: + return false; + } + *ptr = raw; + return true; +{% else %} return xdr_stream_decode_be32(xdr, ptr) == 0; +{% endif %} } diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 index a07586cbee17..446266ad6d17 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 @@ -1,3 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} }; + typedef enum {{ name }} {{ name }}; diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 index 2c18948bddf7..cfeee2287e68 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 @@ -1,3 +1,4 @@ {# SPDX-License-Identifier: GPL-2.0 #} }; + typedef __be32 {{ name }}; diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 new file mode 100644 index 000000000000..900c7516a29c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 @@ -0,0 +1,3 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +{{ content }} diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 new file mode 100644 index 000000000000..900c7516a29c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 @@ -0,0 +1,3 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +{{ content }} diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 index 0b1709cca0d4..19b219dd276d 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 @@ -14,7 +14,11 @@ bool {{ program }}_svc_decode_{{ argument }}(struct svc_rqst *rqstp, struct xdr_ {% if argument == 'void' %} return xdrgen_decode_void(xdr); {% else %} +{% if argument in structs %} struct {{ argument }} *argp = rqstp->rq_argp; +{% else %} + {{ argument }} *argp = rqstp->rq_argp; +{% endif %} return xdrgen_decode_{{ argument }}(xdr, argp); {% endif %} diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 new file mode 100644 index 000000000000..320663ffc37f --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 @@ -0,0 +1,5 @@ +{# SPDX-License-Identifier: GPL-2.0 #} + +#ifndef {{ name }} +#define {{ name }} ({{ value }}) +#endif diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 index 6fc61a5d47b7..746592cfda56 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 @@ -14,8 +14,14 @@ bool {{ program }}_svc_encode_{{ result }}(struct svc_rqst *rqstp, struct xdr_st {% if result == 'void' %} return xdrgen_encode_void(xdr); {% else %} +{% if result in structs %} struct {{ result }} *resp = rqstp->rq_resp; return xdrgen_encode_{{ result }}(xdr, resp); +{% else %} + {{ result }} *resp = rqstp->rq_resp; + + return xdrgen_encode_{{ result }}(xdr, *resp); +{% endif %} {% endif %} } diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 new file mode 100644 index 000000000000..9f3bfb47d2f4 --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 @@ -0,0 +1,3 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +#define {{ '{:<31}'.format(macro) }} \ + ({{ width }}) diff --git a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 index c5518c519854..df3598c38b2c 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 @@ -8,6 +8,5 @@ #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/xdrgen/_defs.h> #include <linux/sunrpc/xdrgen/_builtins.h> -#include <linux/sunrpc/xdrgen/nlm4.h> #include <linux/sunrpc/clnt.h> diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 new file mode 100644 index 000000000000..05ad491f74af --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 @@ -0,0 +1,7 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +{% if annotate %} + /* discriminant {{ name }} */ +{% endif %} + if (!xdrgen_decode_{{ type }}(xdr, &ptr->{{ name }})) + return false; + if (ptr->{{ name }}) { diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 index 01d716d0099e..5fc1937ba774 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 @@ -3,6 +3,7 @@ }; {%- if name in public_apis %} + bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, struct {{ name }} *ptr); bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const struct {{ name }} *ptr); {%- endif -%} diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 new file mode 100644 index 000000000000..e5135ed6471c --- /dev/null +++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 @@ -0,0 +1,7 @@ +{# SPDX-License-Identifier: GPL-2.0 #} +{% if annotate %} + /* discriminant {{ name }} */ +{% endif %} + if (!xdrgen_encode_{{ type }}(xdr, ptr->{{ name }})) + return false; + if (ptr->{{ name }}) { diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py index 5233e73c7046..14bff9477473 100644 --- a/tools/net/sunrpc/xdrgen/xdr_ast.py +++ b/tools/net/sunrpc/xdrgen/xdr_ast.py @@ -34,6 +34,8 @@ def xdr_quadlen(val: str) -> int: symbolic_widths = { "void": ["XDR_void"], "bool": ["XDR_bool"], + "short": ["XDR_short"], + "unsigned_short": ["XDR_unsigned_short"], "int": ["XDR_int"], "unsigned_int": ["XDR_unsigned_int"], "long": ["XDR_long"], @@ -48,6 +50,8 @@ symbolic_widths = { max_widths = { "void": 0, "bool": 1, + "short": 1, + "unsigned_short": 1, "int": 1, "unsigned_int": 1, "long": 1, @@ -326,8 +330,6 @@ class _XdrEnum(_XdrAst): """An XDR enum definition""" name: str - minimum: int - maximum: int enumerators: List[_XdrEnumerator] def max_width(self) -> int: @@ -515,6 +517,13 @@ class _Pragma(_XdrAst): @dataclass +class _XdrPassthru(_XdrAst): + """Passthrough line to emit verbatim in output""" + + content: str + + +@dataclass class Definition(_XdrAst, ast_utils.WithMeta): """Corresponds to 'definition' in the grammar""" @@ -568,8 +577,6 @@ class ParseToAst(Transformer): value = children[1].value return _XdrConstant(name, value) - # cel: Python can compute a min() and max() for the enumerator values - # so that the generated code can perform proper range checking. def enum(self, children): """Instantiate one _XdrEnum object""" enum_name = children[0].symbol @@ -583,7 +590,7 @@ class ParseToAst(Transformer): enumerators.append(_XdrEnumerator(name, value)) i = i + 2 - return _XdrEnum(enum_name, 0, 0, enumerators) + return _XdrEnum(enum_name, enumerators) def fixed_length_opaque(self, children): """Instantiate one _XdrFixedLengthOpaque declaration object""" @@ -738,14 +745,42 @@ class ParseToAst(Transformer): raise NotImplementedError("Directive not supported") return _Pragma() + def passthru_def(self, children): + """Instantiate one _XdrPassthru object""" + token = children[0] + content = token.value[1:] + return _XdrPassthru(content) + transformer = ast_utils.create_transformer(this_module, ParseToAst()) +def _merge_consecutive_passthru(definitions: List[Definition]) -> List[Definition]: + """Merge consecutive passthru definitions into single nodes""" + result = [] + i = 0 + while i < len(definitions): + if isinstance(definitions[i].value, _XdrPassthru): + lines = [definitions[i].value.content] + meta = definitions[i].meta + j = i + 1 + while j < len(definitions) and isinstance(definitions[j].value, _XdrPassthru): + lines.append(definitions[j].value.content) + j += 1 + merged = _XdrPassthru("\n".join(lines)) + result.append(Definition(meta, merged)) + i = j + else: + result.append(definitions[i]) + i += 1 + return result + + def transform_parse_tree(parse_tree): """Transform productions into an abstract syntax tree""" - - return transformer.transform(parse_tree) + ast = transformer.transform(parse_tree) + ast.definitions = _merge_consecutive_passthru(ast.definitions) + return ast def get_header_name() -> str: diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py index 964b44e675df..241e96c1fdd9 100644 --- a/tools/net/sunrpc/xdrgen/xdr_parse.py +++ b/tools/net/sunrpc/xdrgen/xdr_parse.py @@ -3,12 +3,43 @@ """Common parsing code for xdrgen""" +import sys +from typing import Callable + from lark import Lark +from lark.exceptions import UnexpectedInput, UnexpectedToken, VisitError # Set to True to emit annotation comments in generated source annotate = False +# Set to True to emit enum value validation in decoders +enum_validation = True + +# Map internal Lark token names to human-readable names +TOKEN_NAMES = { + "__ANON_0": "identifier", + "__ANON_1": "number", + "SEMICOLON": "';'", + "LBRACE": "'{'", + "RBRACE": "'}'", + "LPAR": "'('", + "RPAR": "')'", + "LSQB": "'['", + "RSQB": "']'", + "LESSTHAN": "'<'", + "MORETHAN": "'>'", + "EQUAL": "'='", + "COLON": "':'", + "COMMA": "','", + "STAR": "'*'", + "$END": "end of file", +} + + +class XdrParseError(Exception): + """Raised when XDR parsing fails""" + def set_xdr_annotate(set_it: bool) -> None: """Set 'annotate' if --annotate was specified on the command line""" @@ -21,6 +52,113 @@ def get_xdr_annotate() -> bool: return annotate +def set_xdr_enum_validation(set_it: bool) -> None: + """Set 'enum_validation' based on command line options""" + global enum_validation + enum_validation = set_it + + +def get_xdr_enum_validation() -> bool: + """Return True when enum validation is enabled for decoder generation""" + return enum_validation + + +def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput], bool]: + """Create an error handler that reports the first parse error and aborts. + + Args: + source: The XDR source text being parsed + filename: The name of the file being parsed + + Returns: + An error handler function for use with Lark's on_error parameter + """ + lines = source.splitlines() + + def handle_parse_error(e: UnexpectedInput) -> bool: + """Report a parse error with context and abort parsing""" + line_num = e.line + column = e.column + line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else "" + + # Build the error message + msg_parts = [f"{filename}:{line_num}:{column}: parse error"] + + # Show what was found vs what was expected + if isinstance(e, UnexpectedToken): + token = e.token + if token.type == "__ANON_0": + found = f"identifier '{token.value}'" + elif token.type == "__ANON_1": + found = f"number '{token.value}'" + else: + found = f"'{token.value}'" + msg_parts.append(f"Unexpected {found}") + + # Provide helpful expected tokens list + expected = e.expected + if expected: + readable = [ + TOKEN_NAMES.get(exp, exp.lower().replace("_", " ")) + for exp in sorted(expected) + ] + if len(readable) == 1: + msg_parts.append(f"Expected {readable[0]}") + elif len(readable) <= 4: + msg_parts.append(f"Expected one of: {', '.join(readable)}") + else: + msg_parts.append(str(e).split("\n")[0]) + + # Show the offending line with a caret pointing to the error + msg_parts.append("") + msg_parts.append(f" {line_text}") + prefix = line_text[: column - 1].expandtabs() + msg_parts.append(f" {' ' * len(prefix)}^") + + sys.stderr.write("\n".join(msg_parts) + "\n") + raise XdrParseError() + + return handle_parse_error + + +def handle_transform_error(e: VisitError, source: str, filename: str) -> None: + """Report a transform error with context. + + Args: + e: The VisitError from Lark's transformer + source: The XDR source text being parsed + filename: The name of the file being parsed + """ + lines = source.splitlines() + + # Extract position from the tree node if available + line_num = 0 + column = 0 + if hasattr(e.obj, "meta") and e.obj.meta: + line_num = e.obj.meta.line + column = e.obj.meta.column + + line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else "" + + # Build the error message + msg_parts = [f"{filename}:{line_num}:{column}: semantic error"] + + # The original exception is typically a KeyError for undefined types + if isinstance(e.orig_exc, KeyError): + msg_parts.append(f"Undefined type '{e.orig_exc.args[0]}'") + else: + msg_parts.append(str(e.orig_exc)) + + # Show the offending line with a caret pointing to the error + if line_text: + msg_parts.append("") + msg_parts.append(f" {line_text}") + prefix = line_text[: column - 1].expandtabs() + msg_parts.append(f" {' ' * len(prefix)}^") + + sys.stderr.write("\n".join(msg_parts) + "\n") + + def xdr_parser() -> Lark: """Return a Lark parser instance configured with the XDR language grammar""" diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen index 3afd0547d67c..b2fb43f4a2ec 100755 --- a/tools/net/sunrpc/xdrgen/xdrgen +++ b/tools/net/sunrpc/xdrgen/xdrgen @@ -123,6 +123,12 @@ There is NO WARRANTY, to the extent permitted by law.""", help="Generate code for client or server side", type=str, ) + source_parser.add_argument( + "--no-enum-validation", + action="store_true", + default=False, + help="Disable enum value validation in decoders", + ) source_parser.add_argument("filename", help="File containing an XDR specification") source_parser.set_defaults(func=source.subcmd) @@ -133,7 +139,5 @@ There is NO WARRANTY, to the extent permitted by law.""", try: if __name__ == "__main__": sys.exit(main()) -except SystemExit: - sys.exit(0) except (KeyboardInterrupt, BrokenPipeError): sys.exit(1) diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py index af02a5b7e5a2..94a5ba348b69 100755 --- a/tools/net/ynl/pyynl/cli.py +++ b/tools/net/ynl/pyynl/cli.py @@ -1,43 +1,85 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +""" +YNL cli tool +""" + import argparse import json import os import pathlib import pprint +import shutil import sys import textwrap +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) -from lib import YnlFamily, Netlink, NlError, SpecFamily +from lib import YnlFamily, Netlink, NlError, SpecFamily, SpecException, YnlException + +SYS_SCHEMA_DIR='/usr/share/ynl' +RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink' + +# pylint: disable=too-few-public-methods,too-many-locals +class Colors: + """ANSI color and font modifier codes""" + RESET = '\033[0m' + + BOLD = '\033[1m' + ITALICS = '\033[3m' + UNDERLINE = '\033[4m' + INVERT = '\033[7m' + + +def color(text, modifiers): + """Add color to text if output is a TTY -sys_schema_dir='/usr/share/ynl' -relative_schema_dir='../../../../Documentation/netlink' + Returns: + Colored text if stdout is a TTY, otherwise plain text + """ + if sys.stdout.isatty(): + # Join the colors if they are a list, if it's a string this a noop + modifiers = "".join(modifiers) + return f"{modifiers}{text}{Colors.RESET}" + return text + +def term_width(): + """ Get terminal width in columns (80 if stdout is not a terminal) """ + return shutil.get_terminal_size().columns def schema_dir(): + """ + Return the effective schema directory, preferring in-tree before + system schema directory. + """ script_dir = os.path.dirname(os.path.abspath(__file__)) - schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}") - if not os.path.isdir(schema_dir): - schema_dir = sys_schema_dir - if not os.path.isdir(schema_dir): - raise Exception(f"Schema directory {schema_dir} does not exist") - return schema_dir + schema_dir_ = os.path.abspath(f"{script_dir}/{RELATIVE_SCHEMA_DIR}") + if not os.path.isdir(schema_dir_): + schema_dir_ = SYS_SCHEMA_DIR + if not os.path.isdir(schema_dir_): + raise YnlException(f"Schema directory {schema_dir_} does not exist") + return schema_dir_ def spec_dir(): - spec_dir = schema_dir() + '/specs' - if not os.path.isdir(spec_dir): - raise Exception(f"Spec directory {spec_dir} does not exist") - return spec_dir + """ + Return the effective spec directory, relative to the effective + schema directory. + """ + spec_dir_ = schema_dir() + '/specs' + if not os.path.isdir(spec_dir_): + raise YnlException(f"Spec directory {spec_dir_} does not exist") + return spec_dir_ class YnlEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, bytes): - return bytes.hex(obj) - if isinstance(obj, set): - return list(obj) - return json.JSONEncoder.default(self, obj) + """A custom encoder for emitting JSON with ynl-specific instance types""" + def default(self, o): + if isinstance(o, bytes): + return bytes.hex(o) + if isinstance(o, set): + return list(o) + return json.JSONEncoder.default(self, o) def print_attr_list(ynl, attr_names, attr_set, indent=2): @@ -46,7 +88,7 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): for attr_name in attr_names: if attr_name in attr_set.attrs: attr = attr_set.attrs[attr_name] - attr_info = f'{prefix}- {attr_name}: {attr.type}' + attr_info = f'{prefix}- {color(attr_name, Colors.BOLD)}: {attr.type}' if 'enum' in attr.yaml: enum_name = attr.yaml['enum'] attr_info += f" (enum: {enum_name})" @@ -54,7 +96,8 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): if enum_name in ynl.consts: const = ynl.consts[enum_name] enum_values = list(const.entries.keys()) - attr_info += f"\n{prefix} {const.type.capitalize()}: {', '.join(enum_values)}" + type_fmted = color(const.type.capitalize(), Colors.ITALICS) + attr_info += f"\n{prefix} {type_fmted}: {', '.join(enum_values)}" # Show nested attributes reference and recursively display them nested_set_name = None @@ -63,7 +106,10 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): attr_info += f" -> {nested_set_name}" if attr.yaml.get('doc'): - doc_text = textwrap.indent(attr.yaml['doc'], prefix + ' ') + doc_prefix = prefix + ' ' * 4 + doc_text = textwrap.fill(attr.yaml['doc'], width=term_width(), + initial_indent=doc_prefix, + subsequent_indent=doc_prefix) attr_info += f"\n{doc_text}" print(attr_info) @@ -77,24 +123,62 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2): print_attr_list(ynl, nested_names, nested_set, indent + 4) -def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True): +def print_mode_attrs(ynl, mode, mode_spec, attr_set, consistent_dd_reply=None): """Print a given mode (do/dump/event/notify).""" mode_title = mode.capitalize() - if print_request and 'request' in mode_spec and 'attributes' in mode_spec['request']: + if 'request' in mode_spec and 'attributes' in mode_spec['request']: print(f'\n{mode_title} request attributes:') print_attr_list(ynl, mode_spec['request']['attributes'], attr_set) if 'reply' in mode_spec and 'attributes' in mode_spec['reply']: - print(f'\n{mode_title} reply attributes:') - print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set) + if consistent_dd_reply and mode == "do": + title = None # Dump handling will print in combined format + elif consistent_dd_reply and mode == "dump": + title = 'Do and Dump' + else: + title = f'{mode_title}' + if title: + print(f'\n{title} reply attributes:') + print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set) + + +def do_doc(ynl, op): + """Handle --list-attrs $op, print the attr information to stdout""" + print(f'Operation: {color(op.name, Colors.BOLD)}') + print(op.yaml['doc']) + + consistent_dd_reply = False + if 'do' in op.yaml and 'dump' in op.yaml and 'reply' in op.yaml['do'] and \ + op.yaml['do']['reply'] == op.yaml['dump'].get('reply'): + consistent_dd_reply = True + + for mode in ['do', 'dump']: + if mode in op.yaml: + print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, + consistent_dd_reply=consistent_dd_reply) + + if 'attributes' in op.yaml.get('event', {}): + print('\nEvent attributes:') + print_attr_list(ynl, op.yaml['event']['attributes'], op.attr_set) - if 'attributes' in mode_spec: - print(f'\n{mode_title} attributes:') - print_attr_list(ynl, mode_spec['attributes'], attr_set) + if 'notify' in op.yaml: + mode_spec = op.yaml['notify'] + ref_spec = ynl.msgs.get(mode_spec).yaml.get('do') + if not ref_spec: + ref_spec = ynl.msgs.get(mode_spec).yaml.get('dump') + if ref_spec: + print('\nNotification attributes:') + print_attr_list(ynl, ref_spec['reply']['attributes'], op.attr_set) + if 'mcgrp' in op.yaml: + print(f"\nMulticast group: {op.yaml['mcgrp']}") + +# pylint: disable=too-many-locals,too-many-branches,too-many-statements def main(): + """YNL cli tool""" + description = """ YNL CLI utility - a general purpose netlink utility that uses YAML specs to drive protocol encoding and decoding. @@ -105,54 +189,85 @@ def main(): """ parser = argparse.ArgumentParser(description=description, - epilog=epilog) - spec_group = parser.add_mutually_exclusive_group(required=True) - spec_group.add_argument('--family', dest='family', type=str, - help='name of the netlink FAMILY') - spec_group.add_argument('--list-families', action='store_true', - help='list all netlink families supported by YNL (has spec)') - spec_group.add_argument('--spec', dest='spec', type=str, - help='choose the family by SPEC file path') - - parser.add_argument('--schema', dest='schema', type=str) - parser.add_argument('--no-schema', action='store_true') - parser.add_argument('--json', dest='json_text', type=str) - - group = parser.add_mutually_exclusive_group() - group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) - group.add_argument('--multi', dest='multi', nargs=2, action='append', - metavar=('DO-OPERATION', 'JSON_TEXT'), type=str) - group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) - group.add_argument('--list-ops', action='store_true') - group.add_argument('--list-msgs', action='store_true') - group.add_argument('--list-attrs', dest='list_attrs', metavar='OPERATION', type=str, - help='List attributes for an operation') - group.add_argument('--validate', action='store_true') - - parser.add_argument('--duration', dest='duration', type=int, - help='when subscribed, watch for DURATION seconds') - parser.add_argument('--sleep', dest='duration', type=int, - help='alias for duration') - parser.add_argument('--subscribe', dest='ntf', type=str) - parser.add_argument('--replace', dest='flags', action='append_const', - const=Netlink.NLM_F_REPLACE) - parser.add_argument('--excl', dest='flags', action='append_const', - const=Netlink.NLM_F_EXCL) - parser.add_argument('--create', dest='flags', action='append_const', - const=Netlink.NLM_F_CREATE) - parser.add_argument('--append', dest='flags', action='append_const', - const=Netlink.NLM_F_APPEND) - parser.add_argument('--process-unknown', action=argparse.BooleanOptionalAction) - parser.add_argument('--output-json', action='store_true') - parser.add_argument('--dbg-small-recv', default=0, const=4000, - action='store', nargs='?', type=int) + epilog=epilog, add_help=False) + + gen_group = parser.add_argument_group('General options') + gen_group.add_argument('-h', '--help', action='help', + help='show this help message and exit') + + spec_group = parser.add_argument_group('Netlink family selection') + spec_sel = spec_group.add_mutually_exclusive_group(required=True) + spec_sel.add_argument('--list-families', action='store_true', + help=('list Netlink families supported by YNL ' + '(which have a spec available in the standard ' + 'system path)')) + spec_sel.add_argument('--family', dest='family', type=str, + help='name of the Netlink FAMILY to use') + spec_sel.add_argument('--spec', dest='spec', type=str, + help='full file path to the YAML spec file') + + ops_group = parser.add_argument_group('Operations') + ops = ops_group.add_mutually_exclusive_group() + ops.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str) + ops.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str) + ops.add_argument('--multi', dest='multi', nargs=2, action='append', + metavar=('DO-OPERATION', 'JSON_TEXT'), type=str, + help="Multi-message operation sequence (for nftables)") + ops.add_argument('--list-ops', action='store_true', + help="List available --do and --dump operations") + ops.add_argument('--list-msgs', action='store_true', + help="List all messages of the family (incl. notifications)") + ops.add_argument('--list-attrs', '--doc', dest='list_attrs', metavar='MSG', + type=str, help='List attributes for a message / operation') + ops.add_argument('--validate', action='store_true', + help="Validate the spec against schema and exit") + + io_group = parser.add_argument_group('Input / Output') + io_group.add_argument('--json', dest='json_text', type=str, + help=('Specify attributes of the message to send ' + 'to the kernel in JSON format. Can be left out ' + 'if the message is expected to be empty.')) + io_group.add_argument('--output-json', action='store_true', + help='Format output as JSON') + + ntf_group = parser.add_argument_group('Notifications') + ntf_group.add_argument('--subscribe', dest='ntf', type=str) + ntf_group.add_argument('--duration', dest='duration', type=int, + help='when subscribed, watch for DURATION seconds') + ntf_group.add_argument('--sleep', dest='duration', type=int, + help='alias for duration') + + nlflags = parser.add_argument_group('Netlink message flags (NLM_F_*)', + ('Extra flags to set in nlmsg_flags of ' + 'the request, used mostly by older ' + 'Classic Netlink families.')) + nlflags.add_argument('--replace', dest='flags', action='append_const', + const=Netlink.NLM_F_REPLACE) + nlflags.add_argument('--excl', dest='flags', action='append_const', + const=Netlink.NLM_F_EXCL) + nlflags.add_argument('--create', dest='flags', action='append_const', + const=Netlink.NLM_F_CREATE) + nlflags.add_argument('--append', dest='flags', action='append_const', + const=Netlink.NLM_F_APPEND) + + schema_group = parser.add_argument_group('Development options') + schema_group.add_argument('--schema', dest='schema', type=str, + help="JSON schema to validate the spec") + schema_group.add_argument('--no-schema', action='store_true') + + dbg_group = parser.add_argument_group('Debug options') + dbg_group.add_argument('--dbg-small-recv', default=0, const=4000, + action='store', nargs='?', type=int, metavar='INT', + help="Length of buffers used for recv()") + dbg_group.add_argument('--process-unknown', action=argparse.BooleanOptionalAction) + args = parser.parse_args() def output(msg): if args.output_json: print(json.dumps(msg, cls=YnlEncoder)) else: - pprint.PrettyPrinter().pprint(msg) + pprint.pprint(msg, width=term_width(), compact=True) if args.list_families: for filename in sorted(os.listdir(spec_dir())): @@ -172,18 +287,18 @@ def main(): else: spec = args.spec if not os.path.isfile(spec): - raise Exception(f"Spec file {spec} does not exist") + raise YnlException(f"Spec file {spec} does not exist") if args.validate: try: SpecFamily(spec, args.schema) - except Exception as error: + except SpecException as error: print(error) - exit(1) + sys.exit(1) return if args.family: # set behaviour when using installed specs - if args.schema is None and spec.startswith(sys_schema_dir): + if args.schema is None and spec.startswith(SYS_SCHEMA_DIR): args.schema = '' # disable schema validation when installed if args.process_unknown is None: args.process_unknown = True @@ -207,23 +322,9 @@ def main(): op = ynl.msgs.get(args.list_attrs) if not op: print(f'Operation {args.list_attrs} not found') - exit(1) - - print(f'Operation: {op.name}') - print(op.yaml['doc']) - - for mode in ['do', 'dump', 'event']: - if mode in op.yaml: - print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True) - - if 'notify' in op.yaml: - mode_spec = op.yaml['notify'] - ref_spec = ynl.msgs.get(mode_spec).yaml.get('do') - if ref_spec: - print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False) + sys.exit(1) - if 'mcgrp' in op.yaml: - print(f"\nMulticast group: {op.yaml['mcgrp']}") + do_doc(ynl, op) try: if args.do: @@ -242,7 +343,7 @@ def main(): output(msg) except NlError as e: print(e) - exit(1) + sys.exit(1) except KeyboardInterrupt: pass except BrokenPipeError: diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py index fd0f6b8d54d1..f1a2a2a89985 100755 --- a/tools/net/ynl/pyynl/ethtool.py +++ b/tools/net/ynl/pyynl/ethtool.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# pylint: disable=too-many-locals, too-many-branches, too-many-statements +# pylint: disable=too-many-return-statements + +""" YNL ethtool utility """ import argparse import pathlib @@ -8,9 +13,12 @@ import sys import re import os +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) -from lib import YnlFamily +# pylint: disable=import-error from cli import schema_dir, spec_dir +from lib import YnlFamily + def args_to_req(ynl, op_name, args, req): """ @@ -48,7 +56,8 @@ def print_field(reply, *desc): return if len(desc) == 0: - return print_field(reply, *zip(reply.keys(), reply.keys())) + print_field(reply, *zip(reply.keys(), reply.keys())) + return for spec in desc: try: @@ -88,11 +97,12 @@ def doit(ynl, args, op_name): args_to_req(ynl, op_name, args.args, req) ynl.do(op_name, req) -def dumpit(ynl, args, op_name, extra = {}): +def dumpit(ynl, args, op_name, extra=None): """ Prepare request header, parse arguments and dumpit (filtering out the devices we're not interested in). """ + extra = extra or {} reply = ynl.dump(op_name, { 'header': {} } | extra) if not reply: return {} @@ -114,9 +124,9 @@ def bits_to_dict(attr): """ ret = {} if 'bits' not in attr: - return dict() + return {} if 'bit' not in attr['bits']: - return dict() + return {} for bit in attr['bits']['bit']: if bit['name'] == '': continue @@ -126,6 +136,8 @@ def bits_to_dict(attr): return ret def main(): + """ YNL ethtool utility """ + parser = argparse.ArgumentParser(description='ethtool wannabe') parser.add_argument('--json', action=argparse.BooleanOptionalAction) parser.add_argument('--show-priv-flags', action=argparse.BooleanOptionalAction) @@ -155,7 +167,7 @@ def main(): # TODO: rss-get parser.add_argument('device', metavar='device', type=str) parser.add_argument('args', metavar='args', type=str, nargs='*') - global args + args = parser.parse_args() spec = os.path.join(spec_dir(), 'ethtool.yaml') @@ -169,13 +181,16 @@ def main(): return if args.set_eee: - return doit(ynl, args, 'eee-set') + doit(ynl, args, 'eee-set') + return if args.set_pause: - return doit(ynl, args, 'pause-set') + doit(ynl, args, 'pause-set') + return if args.set_coalesce: - return doit(ynl, args, 'coalesce-set') + doit(ynl, args, 'coalesce-set') + return if args.set_features: # TODO: parse the bitmask @@ -183,10 +198,12 @@ def main(): return if args.set_channels: - return doit(ynl, args, 'channels-set') + doit(ynl, args, 'channels-set') + return if args.set_ring: - return doit(ynl, args, 'rings-set') + doit(ynl, args, 'rings-set') + return if args.show_priv_flags: flags = bits_to_dict(dumpit(ynl, args, 'privflags-get')['flags']) @@ -337,25 +354,25 @@ def main(): print(f'Time stamping parameters for {args.device}:') print('Capabilities:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])] + _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])] print(f'PTP Hardware Clock: {tsinfo.get("phc-index", "none")}') if 'tx-types' in tsinfo: print('Hardware Transmit Timestamp Modes:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])] + _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])] else: print('Hardware Transmit Timestamp Modes: none') if 'rx-filters' in tsinfo: print('Hardware Receive Filter Modes:') - [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] + _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])] else: print('Hardware Receive Filter Modes: none') if 'stats' in tsinfo and tsinfo['stats']: print('Statistics:') - [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] + _ = [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()] return diff --git a/tools/net/ynl/pyynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py index ec9ea00071be..33a96155fb3b 100644 --- a/tools/net/ynl/pyynl/lib/__init__.py +++ b/tools/net/ynl/pyynl/lib/__init__.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +""" YNL library """ + from .nlspec import SpecAttr, SpecAttrSet, SpecEnumEntry, SpecEnumSet, \ - SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat -from .ynl import YnlFamily, Netlink, NlError + SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat, \ + SpecException +from .ynl import YnlFamily, Netlink, NlError, YnlException from .doc_generator import YnlDocGenerator __all__ = ["SpecAttr", "SpecAttrSet", "SpecEnumEntry", "SpecEnumSet", "SpecFamily", "SpecOperation", "SpecSubMessage", "SpecSubMessageFormat", - "YnlFamily", "Netlink", "NlError", "YnlDocGenerator"] + "SpecException", + "YnlFamily", "Netlink", "NlError", "YnlDocGenerator", "YnlException"] diff --git a/tools/net/ynl/pyynl/lib/doc_generator.py b/tools/net/ynl/pyynl/lib/doc_generator.py index 8b922d8f89e8..74f5d408e048 100644 --- a/tools/net/ynl/pyynl/lib/doc_generator.py +++ b/tools/net/ynl/pyynl/lib/doc_generator.py @@ -109,8 +109,7 @@ class RstFormatters: 'fixed-header': 'definition', 'nested-attributes': 'attribute-set', 'struct': 'definition'} - if prefix in mappings: - prefix = mappings[prefix] + prefix = mappings.get(prefix, prefix) return f":ref:`{namespace}-{prefix}-{name}`" def rst_header(self) -> str: diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py index 85c17fe01e35..fcffeb5b7ba3 100644 --- a/tools/net/ynl/pyynl/lib/nlspec.py +++ b/tools/net/ynl/pyynl/lib/nlspec.py @@ -1,13 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# pylint: disable=missing-function-docstring, too-many-instance-attributes, too-many-branches + +""" +The nlspec is a python library for parsing and using YNL netlink +specifications. +""" import collections import importlib import os -import yaml +import yaml as pyyaml -# To be loaded dynamically as needed -jsonschema = None +class SpecException(Exception): + """Netlink spec exception. + """ class SpecElement: @@ -93,8 +101,7 @@ class SpecEnumEntry(SpecElement): def user_value(self, as_flags=None): if self.enum_set['type'] == 'flags' or as_flags: return 1 << self.value - else: - return self.value + return self.value class SpecEnumSet(SpecElement): @@ -117,8 +124,8 @@ class SpecEnumSet(SpecElement): prev_entry = None value_start = self.yaml.get('value-start', 0) - self.entries = dict() - self.entries_by_val = dict() + self.entries = {} + self.entries_by_val = {} for entry in self.yaml['entries']: e = self.new_entry(entry, prev_entry, value_start) self.entries[e.name] = e @@ -182,7 +189,7 @@ class SpecAttr(SpecElement): self.sub_message = yaml.get('sub-message') self.selector = yaml.get('selector') - self.is_auto_scalar = self.type == "sint" or self.type == "uint" + self.is_auto_scalar = self.type in ("sint", "uint") class SpecAttrSet(SpecElement): @@ -288,7 +295,7 @@ class SpecStruct(SpecElement): yield from self.members def items(self): - return self.members.items() + return self.members class SpecSubMessage(SpecElement): @@ -306,11 +313,11 @@ class SpecSubMessage(SpecElement): self.formats = collections.OrderedDict() for elem in self.yaml['formats']: - format = self.new_format(family, elem) - self.formats[format.value] = format + msg_format = self.new_format(family, elem) + self.formats[msg_format.value] = msg_format - def new_format(self, family, format): - return SpecSubMessageFormat(family, format) + def new_format(self, family, msg_format): + return SpecSubMessageFormat(family, msg_format) class SpecSubMessageFormat(SpecElement): @@ -378,7 +385,7 @@ class SpecOperation(SpecElement): elif self.is_resv: attr_set_name = '' else: - raise Exception(f"Can't resolve attribute set for op '{self.name}'") + raise SpecException(f"Can't resolve attribute set for op '{self.name}'") if attr_set_name: self.attr_set = self.family.attr_sets[attr_set_name] @@ -428,17 +435,22 @@ class SpecFamily(SpecElement): mcast_groups dict of all multicast groups (index by name) kernel_family dict of kernel family attributes """ + + # To be loaded dynamically as needed + jsonschema = None + def __init__(self, spec_path, schema_path=None, exclude_ops=None): - with open(spec_path, "r") as stream: + with open(spec_path, "r", encoding='utf-8') as stream: prefix = '# SPDX-License-Identifier: ' first = stream.readline().strip() if not first.startswith(prefix): - raise Exception('SPDX license tag required in the spec') + raise SpecException('SPDX license tag required in the spec') self.license = first[len(prefix):] stream.seek(0) - spec = yaml.safe_load(stream) + spec = pyyaml.safe_load(stream) + self.fixed_header = None self._resolution_list = [] super().__init__(self, spec) @@ -451,15 +463,13 @@ class SpecFamily(SpecElement): if schema_path is None: schema_path = os.path.dirname(os.path.dirname(spec_path)) + f'/{self.proto}.yaml' if schema_path: - global jsonschema - - with open(schema_path, "r") as stream: - schema = yaml.safe_load(stream) + with open(schema_path, "r", encoding='utf-8') as stream: + schema = pyyaml.safe_load(stream) - if jsonschema is None: - jsonschema = importlib.import_module("jsonschema") + if SpecFamily.jsonschema is None: + SpecFamily.jsonschema = importlib.import_module("jsonschema") - jsonschema.validate(self.yaml, schema) + SpecFamily.jsonschema.validate(self.yaml, schema) self.attr_sets = collections.OrderedDict() self.sub_msgs = collections.OrderedDict() @@ -548,7 +558,7 @@ class SpecFamily(SpecElement): req_val_next = req_val + 1 rsp_val_next = rsp_val + rsp_inc else: - raise Exception("Can't parse directional ops") + raise SpecException("Can't parse directional ops") if req_val == req_val_next: req_val = None @@ -560,20 +570,19 @@ class SpecFamily(SpecElement): skip |= bool(exclude.match(elem['name'])) if not skip: op = self.new_operation(elem, req_val, rsp_val) + self.msgs[op.name] = op req_val = req_val_next rsp_val = rsp_val_next - self.msgs[op.name] = op - def find_operation(self, name): - """ - For a given operation name, find and return operation spec. - """ - for op in self.yaml['operations']['list']: - if name == op['name']: - return op - return None + """ + For a given operation name, find and return operation spec. + """ + for op in self.yaml['operations']['list']: + if name == op['name']: + return op + return None def resolve(self): self.resolve_up(super()) diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 36d36eb7e3b8..9774005e7ad1 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -1,4 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +# +# pylint: disable=missing-class-docstring, missing-function-docstring +# pylint: disable=too-many-branches, too-many-locals, too-many-instance-attributes +# pylint: disable=too-many-lines + +""" +YAML Netlink Library + +An implementation of the genetlink and raw netlink protocols. +""" from collections import namedtuple from enum import Enum @@ -22,6 +32,11 @@ from .nlspec import SpecFamily # +class YnlException(Exception): + pass + + +# pylint: disable=too-few-public-methods class Netlink: # Netlink socket SOL_NETLINK = 270 @@ -144,22 +159,22 @@ class NlAttr: @classmethod def get_format(cls, attr_type, byte_order=None): - format = cls.type_formats[attr_type] + format_ = cls.type_formats[attr_type] if byte_order: - return format.big if byte_order == "big-endian" \ - else format.little - return format.native + return format_.big if byte_order == "big-endian" \ + else format_.little + return format_.native def as_scalar(self, attr_type, byte_order=None): - format = self.get_format(attr_type, byte_order) - return format.unpack(self.raw)[0] + format_ = self.get_format(attr_type, byte_order) + return format_.unpack(self.raw)[0] def as_auto_scalar(self, attr_type, byte_order=None): if len(self.raw) != 4 and len(self.raw) != 8: - raise Exception(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}") + raise YnlException(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}") real_type = attr_type[0] + str(len(self.raw) * 8) - format = self.get_format(real_type, byte_order) - return format.unpack(self.raw)[0] + format_ = self.get_format(real_type, byte_order) + return format_.unpack(self.raw)[0] def as_strz(self): return self.raw.decode('ascii')[:-1] @@ -167,9 +182,9 @@ class NlAttr: def as_bin(self): return self.raw - def as_c_array(self, type): - format = self.get_format(type) - return [ x[0] for x in format.iter_unpack(self.raw) ] + def as_c_array(self, c_type): + format_ = self.get_format(c_type) + return [ x[0] for x in format_.iter_unpack(self.raw) ] def __repr__(self): return f"[type:{self.type} len:{self._len}] {self.raw}" @@ -220,7 +235,7 @@ class NlMsg: self.extack = None if self.nl_flags & Netlink.NLM_F_ACK_TLVS and extack_off: - self.extack = dict() + self.extack = {} extack_attrs = NlAttrs(self.raw[extack_off:]) for extack in extack_attrs: if extack.type == Netlink.NLMSGERR_ATTR_MSG: @@ -245,8 +260,8 @@ class NlMsg: policy = {} for attr in NlAttrs(raw): if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE: - type = attr.as_scalar('u32') - policy['type'] = Netlink.AttrType(type).name + type_ = attr.as_scalar('u32') + policy['type'] = Netlink.AttrType(type_).name elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S: policy['min-value'] = attr.as_scalar('s64') elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S: @@ -281,7 +296,8 @@ class NlMsg: return self.nl_type def __repr__(self): - msg = f"nl_len = {self.nl_len} ({len(self.raw)}) nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}" + msg = (f"nl_len = {self.nl_len} ({len(self.raw)}) " + f"nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}") if self.error: msg += '\n\terror: ' + str(self.error) if self.extack: @@ -289,6 +305,7 @@ class NlMsg: return msg +# pylint: disable=too-few-public-methods class NlMsgs: def __init__(self, data): self.msgs = [] @@ -303,9 +320,6 @@ class NlMsgs: yield from self.msgs -genl_family_name_to_id = None - - def _genl_msg(nl_type, nl_flags, genl_cmd, genl_version, seq=None): # we prepend length in _genl_msg_finalize() if seq is None: @@ -319,7 +333,10 @@ def _genl_msg_finalize(msg): return struct.pack("I", len(msg) + 4) + msg +# pylint: disable=too-many-nested-blocks def _genl_load_families(): + genl_family_name_to_id = {} + with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock: sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1) @@ -330,21 +347,17 @@ def _genl_load_families(): sock.send(msg, 0) - global genl_family_name_to_id - genl_family_name_to_id = dict() - while True: reply = sock.recv(128 * 1024) nms = NlMsgs(reply) for nl_msg in nms: if nl_msg.error: - print("Netlink error:", nl_msg.error) - return + raise YnlException(f"Netlink error: {nl_msg.error}") if nl_msg.done: - return + return genl_family_name_to_id gm = GenlMsg(nl_msg) - fam = dict() + fam = {} for attr in NlAttrs(gm.raw): if attr.type == Netlink.CTRL_ATTR_FAMILY_ID: fam['id'] = attr.as_scalar('u16') @@ -353,7 +366,7 @@ def _genl_load_families(): elif attr.type == Netlink.CTRL_ATTR_MAXATTR: fam['maxattr'] = attr.as_scalar('u32') elif attr.type == Netlink.CTRL_ATTR_MCAST_GROUPS: - fam['mcast'] = dict() + fam['mcast'] = {} for entry in NlAttrs(attr.raw): mcast_name = None mcast_id = None @@ -373,6 +386,7 @@ class GenlMsg: self.nl = nl_msg self.genl_cmd, self.genl_version, _ = struct.unpack_from("BBH", nl_msg.raw, 0) self.raw = nl_msg.raw[4:] + self.raw_attrs = [] def cmd(self): return self.genl_cmd @@ -396,7 +410,7 @@ class NetlinkProtocol: nlmsg = struct.pack("HHII", nl_type, nl_flags, seq, 0) return nlmsg - def message(self, flags, command, version, seq=None): + def message(self, flags, command, _version, seq=None): return self._message(command, flags, seq) def _decode(self, nl_msg): @@ -406,13 +420,13 @@ class NetlinkProtocol: msg = self._decode(nl_msg) if op is None: op = ynl.rsp_by_value[msg.cmd()] - fixed_header_size = ynl._struct_size(op.fixed_header) + fixed_header_size = ynl.struct_size(op.fixed_header) msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size) return msg def get_mcast_id(self, mcast_name, mcast_groups): if mcast_name not in mcast_groups: - raise Exception(f'Multicast group "{mcast_name}" not present in the spec') + raise YnlException(f'Multicast group "{mcast_name}" not present in the spec') return mcast_groups[mcast_name].value def msghdr_size(self): @@ -420,15 +434,16 @@ class NetlinkProtocol: class GenlProtocol(NetlinkProtocol): + genl_family_name_to_id = {} + def __init__(self, family_name): super().__init__(family_name, Netlink.NETLINK_GENERIC) - global genl_family_name_to_id - if genl_family_name_to_id is None: - _genl_load_families() + if not GenlProtocol.genl_family_name_to_id: + GenlProtocol.genl_family_name_to_id = _genl_load_families() - self.genl_family = genl_family_name_to_id[family_name] - self.family_id = genl_family_name_to_id[family_name]['id'] + self.genl_family = GenlProtocol.genl_family_name_to_id[family_name] + self.family_id = GenlProtocol.genl_family_name_to_id[family_name]['id'] def message(self, flags, command, version, seq=None): nlmsg = self._message(self.family_id, flags, seq) @@ -440,13 +455,14 @@ class GenlProtocol(NetlinkProtocol): def get_mcast_id(self, mcast_name, mcast_groups): if mcast_name not in self.genl_family['mcast']: - raise Exception(f'Multicast group "{mcast_name}" not present in the family') + raise YnlException(f'Multicast group "{mcast_name}" not present in the family') return self.genl_family['mcast'][mcast_name] def msghdr_size(self): return super().msghdr_size() + 4 +# pylint: disable=too-few-public-methods class SpaceAttrs: SpecValuesPair = namedtuple('SpecValuesPair', ['spec', 'values']) @@ -461,9 +477,9 @@ class SpaceAttrs: if name in scope.values: return scope.values[name] spec_name = scope.spec.yaml['name'] - raise Exception( + raise YnlException( f"No value for '{name}' in attribute space '{spec_name}'") - raise Exception(f"Attribute '{name}' not defined in any attribute-set") + raise YnlException(f"Attribute '{name}' not defined in any attribute-set") # @@ -485,8 +501,8 @@ class YnlFamily(SpecFamily): self.yaml['protonum']) else: self.nlproto = GenlProtocol(self.yaml['name']) - except KeyError: - raise Exception(f"Family '{self.yaml['name']}' not supported by the kernel") + except KeyError as err: + raise YnlException(f"Family '{self.yaml['name']}' not supported by the kernel") from err self._recv_dbg = False # Note that netlink will use conservative (min) message size for @@ -542,8 +558,7 @@ class YnlFamily(SpecFamily): for single_value in value: scalar += enum.entries[single_value].user_value(as_flags = True) return scalar - else: - return enum.entries[value].user_value() + return enum.entries[value].user_value() def _get_scalar(self, attr_spec, value): try: @@ -555,11 +570,12 @@ class YnlFamily(SpecFamily): return self._from_string(value, attr_spec) raise e + # pylint: disable=too-many-statements def _add_attr(self, space, name, value, search_attrs): try: attr = self.attr_sets[space][name] - except KeyError: - raise Exception(f"Space '{space}' has no attribute '{name}'") + except KeyError as err: + raise YnlException(f"Space '{space}' has no attribute '{name}'") from err nl_type = attr.value if attr.is_multi and isinstance(value, list): @@ -597,18 +613,18 @@ class YnlFamily(SpecFamily): elif isinstance(value, dict) and attr.struct_name: attr_payload = self._encode_struct(attr.struct_name, value) elif isinstance(value, list) and attr.sub_type in NlAttr.type_formats: - format = NlAttr.get_format(attr.sub_type) - attr_payload = b''.join([format.pack(x) for x in value]) + format_ = NlAttr.get_format(attr.sub_type) + attr_payload = b''.join([format_.pack(x) for x in value]) else: - raise Exception(f'Unknown type for binary attribute, value: {value}') + raise YnlException(f'Unknown type for binary attribute, value: {value}') elif attr['type'] in NlAttr.type_formats or attr.is_auto_scalar: scalar = self._get_scalar(attr, value) if attr.is_auto_scalar: attr_type = attr["type"][0] + ('32' if scalar.bit_length() <= 32 else '64') else: attr_type = attr["type"] - format = NlAttr.get_format(attr_type, attr.byte_order) - attr_payload = format.pack(scalar) + format_ = NlAttr.get_format(attr_type, attr.byte_order) + attr_payload = format_.pack(scalar) elif attr['type'] in "bitfield32": scalar_value = self._get_scalar(attr, value["value"]) scalar_selector = self._get_scalar(attr, value["selector"]) @@ -626,9 +642,9 @@ class YnlFamily(SpecFamily): attr_payload += self._add_attr(msg_format.attr_set, subname, subvalue, sub_attrs) else: - raise Exception(f"Unknown attribute-set '{msg_format.attr_set}'") + raise YnlException(f"Unknown attribute-set '{msg_format.attr_set}'") else: - raise Exception(f'Unknown type at {space} {name} {value} {attr["type"]}') + raise YnlException(f'Unknown type at {space} {name} {value} {attr["type"]}') return self._add_attr_raw(nl_type, attr_payload) @@ -715,7 +731,7 @@ class YnlFamily(SpecFamily): subattr = self._formatted_string(subattr, attr_spec.display_hint) decoded.append(subattr) else: - raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') + raise YnlException(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded def _decode_nest_type_value(self, attr, attr_spec): @@ -731,12 +747,11 @@ class YnlFamily(SpecFamily): def _decode_unknown(self, attr): if attr.is_nest: return self._decode(NlAttrs(attr.raw), None) - else: - return attr.as_bin() + return attr.as_bin() def _rsp_add(self, rsp, name, is_multi, decoded): if is_multi is None: - if name in rsp and type(rsp[name]) is not list: + if name in rsp and not isinstance(rsp[name], list): rsp[name] = [rsp[name]] is_multi = True else: @@ -752,13 +767,13 @@ class YnlFamily(SpecFamily): def _resolve_selector(self, attr_spec, search_attrs): sub_msg = attr_spec.sub_message if sub_msg not in self.sub_msgs: - raise Exception(f"No sub-message spec named {sub_msg} for {attr_spec.name}") + raise YnlException(f"No sub-message spec named {sub_msg} for {attr_spec.name}") sub_msg_spec = self.sub_msgs[sub_msg] selector = attr_spec.selector value = search_attrs.lookup(selector) if value not in sub_msg_spec.formats: - raise Exception(f"No message format for '{value}' in sub-message spec '{sub_msg}'") + raise YnlException(f"No message format for '{value}' in sub-message spec '{sub_msg}'") spec = sub_msg_spec.formats[value] return spec, value @@ -769,17 +784,20 @@ class YnlFamily(SpecFamily): offset = 0 if msg_format.fixed_header: decoded.update(self._decode_struct(attr.raw, msg_format.fixed_header)) - offset = self._struct_size(msg_format.fixed_header) + offset = self.struct_size(msg_format.fixed_header) if msg_format.attr_set: if msg_format.attr_set in self.attr_sets: subdict = self._decode(NlAttrs(attr.raw, offset), msg_format.attr_set) decoded.update(subdict) else: - raise Exception(f"Unknown attribute-set '{msg_format.attr_set}' when decoding '{attr_spec.name}'") + raise YnlException(f"Unknown attribute-set '{msg_format.attr_set}' " + f"when decoding '{attr_spec.name}'") return decoded + # pylint: disable=too-many-statements def _decode(self, attrs, space, outer_attrs = None): - rsp = dict() + rsp = {} + search_attrs = {} if space: attr_space = self.attr_sets[space] search_attrs = SpaceAttrs(attr_space, rsp, outer_attrs) @@ -787,16 +805,19 @@ class YnlFamily(SpecFamily): for attr in attrs: try: attr_spec = attr_space.attrs_by_val[attr.type] - except (KeyError, UnboundLocalError): + except (KeyError, UnboundLocalError) as err: if not self.process_unknown: - raise Exception(f"Space '{space}' has no attribute with value '{attr.type}'") + raise YnlException(f"Space '{space}' has no attribute " + f"with value '{attr.type}'") from err attr_name = f"UnknownAttr({attr.type})" self._rsp_add(rsp, attr_name, None, self._decode_unknown(attr)) continue try: if attr_spec["type"] == 'nest': - subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs) + subdict = self._decode(NlAttrs(attr.raw), + attr_spec['nested-attributes'], + search_attrs) decoded = subdict elif attr_spec["type"] == 'string': decoded = attr.as_strz() @@ -828,7 +849,8 @@ class YnlFamily(SpecFamily): decoded = self._decode_nest_type_value(attr, attr_spec) else: if not self.process_unknown: - raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}') + raise YnlException(f'Unknown {attr_spec["type"]} ' + f'with name {attr_spec["name"]}') decoded = self._decode_unknown(attr) self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded) @@ -838,12 +860,14 @@ class YnlFamily(SpecFamily): return rsp + # pylint: disable=too-many-arguments, too-many-positional-arguments def _decode_extack_path(self, attrs, attr_set, offset, target, search_attrs): for attr in attrs: try: attr_spec = attr_set.attrs_by_val[attr.type] - except KeyError: - raise Exception(f"Space '{attr_set.name}' has no attribute with value '{attr.type}'") + except KeyError as err: + raise YnlException( + f"Space '{attr_set.name}' has no attribute with value '{attr.type}'") from err if offset > target: break if offset == target: @@ -860,11 +884,12 @@ class YnlFamily(SpecFamily): elif attr_spec['type'] == 'sub-message': msg_format, value = self._resolve_selector(attr_spec, search_attrs) if msg_format is None: - raise Exception(f"Can't resolve sub-message of {attr_spec['name']} for extack") + raise YnlException(f"Can't resolve sub-message of " + f"{attr_spec['name']} for extack") sub_attrs = self.attr_sets[msg_format.attr_set] pathname += f"({value})" else: - raise Exception(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack") + raise YnlException(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack") offset += 4 subpath = self._decode_extack_path(NlAttrs(attr.raw), sub_attrs, offset, target, search_attrs) @@ -879,7 +904,7 @@ class YnlFamily(SpecFamily): return msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op) - offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header) + offset = self.nlproto.msghdr_size() + self.struct_size(op.fixed_header) search_attrs = SpaceAttrs(op.attr_set, vals) path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset, extack['bad-attr-offs'], search_attrs) @@ -887,26 +912,25 @@ class YnlFamily(SpecFamily): del extack['bad-attr-offs'] extack['bad-attr'] = path - def _struct_size(self, name): + def struct_size(self, name): if name: members = self.consts[name].members size = 0 for m in members: if m.type in ['pad', 'binary']: if m.struct: - size += self._struct_size(m.struct) + size += self.struct_size(m.struct) else: size += m.len else: - format = NlAttr.get_format(m.type, m.byte_order) - size += format.size + format_ = NlAttr.get_format(m.type, m.byte_order) + size += format_.size return size - else: - return 0 + return 0 def _decode_struct(self, data, name): members = self.consts[name].members - attrs = dict() + attrs = {} offset = 0 for m in members: value = None @@ -914,17 +938,17 @@ class YnlFamily(SpecFamily): offset += m.len elif m.type == 'binary': if m.struct: - len = self._struct_size(m.struct) - value = self._decode_struct(data[offset : offset + len], + len_ = self.struct_size(m.struct) + value = self._decode_struct(data[offset : offset + len_], m.struct) - offset += len + offset += len_ else: value = data[offset : offset + m.len] offset += m.len else: - format = NlAttr.get_format(m.type, m.byte_order) - [ value ] = format.unpack_from(data, offset) - offset += format.size + format_ = NlAttr.get_format(m.type, m.byte_order) + [ value ] = format_.unpack_from(data, offset) + offset += format_.size if value is not None: if m.enum: value = self._decode_enum(value, m) @@ -943,7 +967,7 @@ class YnlFamily(SpecFamily): elif m.type == 'binary': if m.struct: if value is None: - value = dict() + value = {} attr_payload += self._encode_struct(m.struct, value) else: if value is None: @@ -953,13 +977,13 @@ class YnlFamily(SpecFamily): else: if value is None: value = 0 - format = NlAttr.get_format(m.type, m.byte_order) - attr_payload += format.pack(value) + format_ = NlAttr.get_format(m.type, m.byte_order) + attr_payload += format_.pack(value) return attr_payload def _formatted_string(self, raw, display_hint): if display_hint == 'mac': - formatted = ':'.join('%02x' % b for b in raw) + formatted = ':'.join(f'{b:02x}' for b in raw) elif display_hint == 'hex': if isinstance(raw, int): formatted = hex(raw) @@ -991,16 +1015,16 @@ class YnlFamily(SpecFamily): mac_bytes = [int(x, 16) for x in string.split(':')] else: if len(string) % 2 != 0: - raise Exception(f"Invalid MAC address format: {string}") + raise YnlException(f"Invalid MAC address format: {string}") mac_bytes = [int(string[i:i+2], 16) for i in range(0, len(string), 2)] raw = bytes(mac_bytes) else: - raise Exception(f"Display hint '{attr_spec.display_hint}' not implemented" + raise YnlException(f"Display hint '{attr_spec.display_hint}' not implemented" f" when parsing '{attr_spec['name']}'") return raw def handle_ntf(self, decoded): - msg = dict() + msg = {} if self.include_raw: msg['raw'] = decoded op = self.rsp_by_value[decoded.cmd()] @@ -1081,6 +1105,7 @@ class YnlFamily(SpecFamily): msg = _genl_msg_finalize(msg) return msg + # pylint: disable=too-many-statements def _ops(self, ops): reqs_by_seq = {} req_seq = random.randint(1024, 65535) @@ -1139,9 +1164,8 @@ class YnlFamily(SpecFamily): if decoded.cmd() in self.async_msg_ids: self.handle_ntf(decoded) continue - else: - print('Unexpected message: ' + repr(decoded)) - continue + print('Unexpected message: ' + repr(decoded)) + continue rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name) if op.fixed_header: diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index b517d0c605ad..0e1e486c1185 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1,5 +1,17 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# pylint: disable=line-too-long, missing-class-docstring, missing-function-docstring +# pylint: disable=too-many-positional-arguments, too-many-arguments, too-many-statements +# pylint: disable=too-many-branches, too-many-locals, too-many-instance-attributes +# pylint: disable=too-many-nested-blocks, too-many-lines, too-few-public-methods +# pylint: disable=broad-exception-raised, broad-exception-caught, protected-access + +""" +ynl_gen_c + +A YNL to C code generator for both kernel and userspace protocol stubs. +""" import argparse import filecmp @@ -9,8 +21,9 @@ import re import shutil import sys import tempfile -import yaml +import yaml as pyyaml +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import SpecFamily, SpecAttrSet, SpecAttr, SpecOperation, SpecEnumSet, SpecEnumEntry from lib import SpecSubMessage @@ -157,7 +170,7 @@ class Type(SpecAttr): def presence_member(self, space, type_filter): if self.presence_type() != type_filter: - return + return '' if self.presence_type() == 'present': pfx = '__' if space == 'user' else '' @@ -166,14 +179,15 @@ class Type(SpecAttr): if self.presence_type() in {'len', 'count'}: pfx = '__' if space == 'user' else '' return f"{pfx}u32 {self.c_name};" + return '' - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return None def free_needs_iter(self): return False - def _free_lines(self, ri, var, ref): + def _free_lines(self, _ri, var, ref): if self.is_multi_val() or self.presence_type() in {'count', 'len'}: return [f'free({var}->{ref}{self.c_name});'] return [] @@ -183,9 +197,10 @@ class Type(SpecAttr): for line in lines: ri.cw.p(line) + # pylint: disable=assignment-from-none def arg_member(self, ri): member = self._complex_member_type(ri) - if member: + if member is not None: spc = ' ' if member[-1] != '*' else '' arg = [member + spc + '*' + self.c_name] if self.presence_type() == 'count': @@ -195,7 +210,7 @@ class Type(SpecAttr): def struct_member(self, ri): member = self._complex_member_type(ri) - if member: + if member is not None: ptr = '*' if self.is_multi_val() else '' if self.is_recursive_for_op(ri): ptr = '*' @@ -243,9 +258,9 @@ class Type(SpecAttr): def attr_get(self, ri, var, first): lines, init_lines, _ = self._attr_get(ri, var) - if type(lines) is str: + if isinstance(lines, str): lines = [lines] - if type(init_lines) is str: + if isinstance(init_lines, str): init_lines = [init_lines] kw = 'if' if first else 'else if' @@ -270,7 +285,7 @@ class Type(SpecAttr): def _setter_lines(self, ri, member, presence): raise Exception(f"Setter not implemented for class type {self.type}") - def setter(self, ri, space, direction, deref=False, ref=None, var="req"): + def setter(self, ri, _space, direction, deref=False, ref=None, var="req"): ref = (ref if ref else []) + [self.c_name] member = f"{var}->{'.'.join(ref)}" @@ -280,6 +295,7 @@ class Type(SpecAttr): code = [] presence = '' + # pylint: disable=consider-using-enumerate for i in range(0, len(ref)): presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}" # Every layer below last is a nest, so we know it uses bit presence @@ -414,6 +430,7 @@ class TypeScalar(Type): if low < -32768 or high > 32767: self.checks['full-range'] = True + # pylint: disable=too-many-return-statements def _attr_policy(self, policy): if 'flags-mask' in self.checks or self.is_bitfield: if self.is_bitfield: @@ -424,15 +441,15 @@ class TypeScalar(Type): flag_cnt = len(flags['entries']) mask = (1 << flag_cnt) - 1 return f"NLA_POLICY_MASK({policy}, 0x{mask:x})" - elif 'full-range' in self.checks: + if 'full-range' in self.checks: return f"NLA_POLICY_FULL_RANGE({policy}, &{c_lower(self.enum_name)}_range)" - elif 'range' in self.checks: + if 'range' in self.checks: return f"NLA_POLICY_RANGE({policy}, {self.get_limit_str('min')}, {self.get_limit_str('max')})" - elif 'min' in self.checks: + if 'min' in self.checks: return f"NLA_POLICY_MIN({policy}, {self.get_limit_str('min')})" - elif 'max' in self.checks: + if 'max' in self.checks: return f"NLA_POLICY_MAX({policy}, {self.get_limit_str('max')})" - elif 'sparse' in self.checks: + if 'sparse' in self.checks: return f"NLA_POLICY_VALIDATE_FN({policy}, &{c_lower(self.enum_name)}_validate)" return super()._attr_policy(policy) @@ -554,6 +571,8 @@ class TypeBinary(Type): mem = 'NLA_POLICY_MIN_LEN(' + self.get_limit_str('min-len') + ')' elif 'max-len' in self.checks: mem = 'NLA_POLICY_MAX_LEN(' + self.get_limit_str('max-len') + ')' + else: + raise Exception('Failed to process policy check for binary type') return mem @@ -627,7 +646,7 @@ class TypeBinaryScalarArray(TypeBinary): class TypeBitfield32(Type): - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return "struct nla_bitfield32" def _attr_typol(self): @@ -655,7 +674,7 @@ class TypeNest(Type): def is_recursive(self): return self.family.pure_nested_structs[self.nested_attrs].recursive - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return self.nested_struct_type def _free_lines(self, ri, var, ref): @@ -689,7 +708,7 @@ class TypeNest(Type): f"parg.data = &{var}->{self.c_name};"] return get_lines, init_lines, None - def setter(self, ri, space, direction, deref=False, ref=None, var="req"): + def setter(self, ri, _space, direction, deref=False, ref=None, var="req"): ref = (ref if ref else []) + [self.c_name] for _, attr in ri.family.pure_nested_structs[self.nested_attrs].member_list(): @@ -714,19 +733,18 @@ class TypeMultiAttr(Type): def _complex_member_type(self, ri): if 'type' not in self.attr or self.attr['type'] == 'nest': return self.nested_struct_type - elif self.attr['type'] == 'binary' and 'struct' in self.attr: + if self.attr['type'] == 'binary' and 'struct' in self.attr: return None # use arg_member() - elif self.attr['type'] == 'string': + if self.attr['type'] == 'string': return 'struct ynl_string *' - elif self.attr['type'] in scalars: + if self.attr['type'] in scalars: scalar_pfx = '__' if ri.ku_space == 'user' else '' if self.is_auto_scalar: name = self.type[0] + '64' else: name = self.attr['type'] return scalar_pfx + name - else: - raise Exception(f"Sub-type {self.attr['type']} not supported yet") + raise Exception(f"Sub-type {self.attr['type']} not supported yet") def arg_member(self, ri): if self.type == 'binary' and 'struct' in self.attr: @@ -737,7 +755,7 @@ class TypeMultiAttr(Type): def free_needs_iter(self): return self.attr['type'] in {'nest', 'string'} - def _free_lines(self, ri, var, ref): + def _free_lines(self, _ri, var, ref): lines = [] if self.attr['type'] in scalars: lines += [f"free({var}->{ref}{self.c_name});"] @@ -801,13 +819,12 @@ class TypeIndexedArray(Type): def _complex_member_type(self, ri): if 'sub-type' not in self.attr or self.attr['sub-type'] == 'nest': return self.nested_struct_type - elif self.attr['sub-type'] in scalars: + if self.attr['sub-type'] in scalars: scalar_pfx = '__' if ri.ku_space == 'user' else '' return scalar_pfx + self.attr['sub-type'] - elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: + if self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: return None # use arg_member() - else: - raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet") + raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet") def arg_member(self, ri): if self.sub_type == 'binary' and 'exact-len' in self.checks: @@ -823,12 +840,11 @@ class TypeIndexedArray(Type): def _attr_typol(self): if self.attr['sub-type'] in scalars: return f'.type = YNL_PT_U{c_upper(self.sub_type[1:])}, ' - elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: + if self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: return f'.type = YNL_PT_BINARY, .len = {self.checks["exact-len"]}, ' - elif self.attr['sub-type'] == 'nest': + if self.attr['sub-type'] == 'nest': return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, ' - else: - raise Exception(f"Typol for IndexedArray sub-type {self.attr['sub-type']} not supported, yet") + raise Exception(f"Typol for IndexedArray sub-type {self.attr['sub-type']} not supported, yet") def _attr_get(self, ri, var): local_vars = ['const struct nlattr *attr2;'] @@ -864,18 +880,18 @@ class TypeIndexedArray(Type): def free_needs_iter(self): return self.sub_type == 'nest' - def _free_lines(self, ri, var, ref): + def _free_lines(self, _ri, var, ref): lines = [] if self.sub_type == 'nest': lines += [ f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)", f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);', ] - lines += f"free({var}->{ref}{self.c_name});", + lines += (f"free({var}->{ref}{self.c_name});",) return lines class TypeNestTypeValue(Type): - def _complex_member_type(self, ri): + def _complex_member_type(self, _ri): return self.nested_struct_type def _attr_typol(self): @@ -921,15 +937,15 @@ class TypeSubMessage(TypeNest): return typol def _attr_get(self, ri, var): - sel = c_lower(self['selector']) + selector = self['selector'] + sel = c_lower(selector) if self.selector.is_external(): sel_var = f"_sel_{sel}" else: sel_var = f"{var}->{sel}" get_lines = [f'if (!{sel_var})', - 'return ynl_submsg_failed(yarg, "%s", "%s");' % - (self.name, self['selector']), - f"if ({self.nested_render_name}_parse(&parg, {sel_var}, attr))", + f'return ynl_submsg_failed(yarg, "{self.name}", "{selector}");', + f"if ({self.nested_render_name}_parse(&parg, {sel_var}, attr))", "return YNL_PARSE_CB_ERROR;"] init_lines = [f"parg.rsp_policy = &{self.nested_render_name}_nest;", f"parg.data = &{var}->{self.c_name};"] @@ -988,7 +1004,7 @@ class Struct: self.in_multi_val = False # used by a MultiAttr or and legacy arrays self.attr_list = [] - self.attrs = dict() + self.attrs = {} if type_list is not None: for t in type_list: self.attr_list.append((t, self.attr_set[t]),) @@ -1020,7 +1036,7 @@ class Struct: def external_selectors(self): sels = [] - for name, attr in self.attr_list: + for _name, attr in self.attr_list: if isinstance(attr, TypeSubMessage) and attr.selector.is_external(): sels.append(attr.selector) return sels @@ -1037,9 +1053,9 @@ class EnumEntry(SpecEnumEntry): super().__init__(enum_set, yaml, prev, value_start) if prev: - self.value_change = (self.value != prev.value + 1) + self.value_change = self.value != prev.value + 1 else: - self.value_change = (self.value != 0) + self.value_change = self.value != 0 self.value_change = self.value_change or self.enum_set['type'] == 'flags' # Added by resolve: @@ -1080,8 +1096,8 @@ class EnumSet(SpecEnumSet): return EnumEntry(self, entry, prev_entry, value_start) def value_range(self): - low = min([x.value for x in self.entries.values()]) - high = max([x.value for x in self.entries.values()]) + low = min(x.value for x in self.entries.values()) + high = max(x.value for x in self.entries.values()) if high - low + 1 != len(self.entries): return None, None @@ -1220,6 +1236,12 @@ class Family(SpecFamily): self.hooks = None delattr(self, "hooks") + self.root_sets = {} + self.pure_nested_structs = {} + self.kernel_policy = None + self.global_policy = None + self.global_policy_set = None + super().__init__(file_name, exclude_ops=exclude_ops) self.fam_key = c_upper(self.yaml.get('c-family-name', self.yaml["name"] + '_FAMILY_NAME')) @@ -1254,18 +1276,18 @@ class Family(SpecFamily): self.mcgrps = self.yaml.get('mcast-groups', {'list': []}) - self.hooks = dict() + self.hooks = {} for when in ['pre', 'post']: - self.hooks[when] = dict() + self.hooks[when] = {} for op_mode in ['do', 'dump']: - self.hooks[when][op_mode] = dict() + self.hooks[when][op_mode] = {} self.hooks[when][op_mode]['set'] = set() self.hooks[when][op_mode]['list'] = [] # dict space-name -> 'request': set(attrs), 'reply': set(attrs) - self.root_sets = dict() + self.root_sets = {} # dict space-name -> Struct - self.pure_nested_structs = dict() + self.pure_nested_structs = {} self._mark_notify() self._mock_up_events() @@ -1311,7 +1333,7 @@ class Family(SpecFamily): } def _load_root_sets(self): - for op_name, op in self.msgs.items(): + for _op_name, op in self.msgs.items(): if 'attribute-set' not in op: continue @@ -1427,7 +1449,7 @@ class Family(SpecFamily): attr_set_queue = list(self.root_sets.keys()) attr_set_seen = set(self.root_sets.keys()) - while len(attr_set_queue): + while attr_set_queue: a_set = attr_set_queue.pop(0) for attr, spec in self.attr_sets[a_set].items(): if 'nested-attributes' in spec: @@ -1510,7 +1532,7 @@ class Family(SpecFamily): for k, _ in self.root_sets.items(): yield k, None # we don't have a struct, but it must be terminal - for attr_set, struct in all_structs(): + for attr_set, _struct in all_structs(): for _, spec in self.attr_sets[attr_set].items(): if 'nested-attributes' in spec: child_name = spec['nested-attributes'] @@ -1530,7 +1552,7 @@ class Family(SpecFamily): def _load_global_policy(self): global_set = set() attr_set_name = None - for op_name, op in self.ops.items(): + for _op_name, op in self.ops.items(): if not op: continue if 'attribute-set' not in op: @@ -1613,7 +1635,7 @@ class RenderInfo: self.cw = cw - self.struct = dict() + self.struct = {} if op_mode == 'notify': op_mode = 'do' if 'do' in op else 'dump' for op_dir in ['request', 'reply']: @@ -1650,6 +1672,7 @@ class CodeWriter: if out_file is None: self._out = os.sys.stdout else: + # pylint: disable=consider-using-with self._out = tempfile.NamedTemporaryFile('w+') self._out_file = out_file @@ -1664,7 +1687,7 @@ class CodeWriter: if not self._overwrite and os.path.isfile(self._out_file): if filecmp.cmp(self._out.name, self._out_file, shallow=False): return - with open(self._out_file, 'w+') as out_file: + with open(self._out_file, 'w+', encoding='utf-8') as out_file: self._out.seek(0) shutil.copyfileobj(self._out, out_file) self._out.close() @@ -1779,7 +1802,7 @@ class CodeWriter: if not local_vars: return - if type(local_vars) is str: + if isinstance(local_vars, str): local_vars = [local_vars] local_vars.sort(key=len, reverse=True) @@ -1799,20 +1822,19 @@ class CodeWriter: def writes_defines(self, defines): longest = 0 for define in defines: - if len(define[0]) > longest: - longest = len(define[0]) + longest = max(len(define[0]), longest) longest = ((longest + 8) // 8) * 8 for define in defines: line = '#define ' + define[0] line += '\t' * ((longest - len(define[0]) + 7) // 8) - if type(define[1]) is int: + if isinstance(define[1], int): line += str(define[1]) - elif type(define[1]) is str: + elif isinstance(define[1], str): line += '"' + define[1] + '"' self.p(line) def write_struct_init(self, members): - longest = max([len(x[0]) for x in members]) + longest = max(len(x[0]) for x in members) longest += 1 # because we prepend a . longest = ((longest + 8) // 8) * 8 for one in members: @@ -2038,12 +2060,12 @@ def put_op_name(family, cw): _put_enum_to_str_helper(cw, family.c_name + '_op', map_name, 'op') -def put_enum_to_str_fwd(family, cw, enum): +def put_enum_to_str_fwd(_family, cw, enum): args = [enum.user_type + ' value'] cw.write_func_prot('const char *', f'{enum.render_name}_str', args, suffix=';') -def put_enum_to_str(family, cw, enum): +def put_enum_to_str(_family, cw, enum): map_name = f'{enum.render_name}_strmap' cw.block_start(line=f"static const char * const {map_name}[] =") for entry in enum.entries.values(): @@ -2324,7 +2346,8 @@ def parse_rsp_nested_prototype(ri, struct, suffix=';'): def parse_rsp_nested(ri, struct): if struct.submsg: - return parse_rsp_submsg(ri, struct) + parse_rsp_submsg(ri, struct) + return parse_rsp_nested_prototype(ri, struct, suffix='') @@ -2654,7 +2677,7 @@ def print_req_free(ri): def print_rsp_type(ri): - if (ri.op_mode == 'do' or ri.op_mode == 'dump') and 'reply' in ri.op[ri.op_mode]: + if ri.op_mode in ('do', 'dump') and 'reply' in ri.op[ri.op_mode]: direction = 'reply' elif ri.op_mode == 'event': direction = 'reply' @@ -2667,7 +2690,7 @@ def print_wrapped_type(ri): ri.cw.block_start(line=f"{type_name(ri, 'reply')}") if ri.op_mode == 'dump': ri.cw.p(f"{type_name(ri, 'reply')} *next;") - elif ri.op_mode == 'notify' or ri.op_mode == 'event': + elif ri.op_mode in ('notify', 'event'): ri.cw.p('__u16 family;') ri.cw.p('__u8 cmd;') ri.cw.p('struct ynl_ntf_base_type *next;') @@ -2704,7 +2727,7 @@ def _free_type(ri, direction, struct): def free_rsp_nested_prototype(ri): - print_free_prototype(ri, "") + print_free_prototype(ri, "") def free_rsp_nested(ri, struct): @@ -2930,7 +2953,7 @@ def print_kernel_op_table_hdr(family, cw): def print_kernel_op_table(family, cw): print_kernel_op_table_fwd(family, cw, terminate=False) - if family.kernel_policy == 'global' or family.kernel_policy == 'per-op': + if family.kernel_policy in ('global', 'per-op'): for op_name, op in family.ops.items(): if op.is_async: continue @@ -3346,7 +3369,7 @@ def render_user_family(family, cw, prototype): else: raise Exception('Invalid notification ' + ntf_op_name) _render_user_ntf_entry(ri, ntf_op) - for op_name, op in family.ops.items(): + for _op_name, op in family.ops.items(): if 'event' not in op: continue ri = RenderInfo(cw, family, "user", op, "event") @@ -3418,12 +3441,11 @@ def main(): print('Spec license:', parsed.license) print('License must be: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)') os.sys.exit(1) - except yaml.YAMLError as exc: + except pyyaml.YAMLError as exc: print(exc) os.sys.exit(1) - return - cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out)) + cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=not args.cmp_out) _, spec_kernel = find_kernel_root(args.spec) if args.mode == 'uapi' or args.header: @@ -3524,7 +3546,7 @@ def main(): cw.nl() if parsed.kernel_policy in {'per-op', 'split'}: - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): if 'do' in op and 'event' not in op: ri = RenderInfo(cw, parsed, args.mode, op, "do") print_req_policy_fwd(cw, ri.struct['request'], ri=ri) @@ -3553,7 +3575,7 @@ def main(): print_req_policy(cw, struct) cw.nl() - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): if parsed.kernel_policy in {'per-op', 'split'}: for op_mode in ['do', 'dump']: if op_mode in op and 'request' in op[op_mode]: @@ -3581,7 +3603,7 @@ def main(): ri = RenderInfo(cw, parsed, args.mode, "", "", attr_set) print_type_full(ri, struct) - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): cw.p(f"/* ============== {op.enum_name} ============== */") if 'do' in op and 'event' not in op: @@ -3614,7 +3636,7 @@ def main(): raise Exception(f'Only notifications with consistent types supported ({op.name})') print_wrapped_type(ri) - for op_name, op in parsed.ntfs.items(): + for _op_name, op in parsed.ntfs.items(): if 'event' in op: ri = RenderInfo(cw, parsed, args.mode, op, 'event') cw.p(f"/* {op.enum_name} - event */") @@ -3664,7 +3686,7 @@ def main(): if struct.reply: parse_rsp_nested(ri, struct) - for op_name, op in parsed.ops.items(): + for _op_name, op in parsed.ops.items(): cw.p(f"/* ============== {op.enum_name} ============== */") if 'do' in op and 'event' not in op: cw.p(f"/* {op.enum_name} - do */") @@ -3692,7 +3714,7 @@ def main(): raise Exception(f'Only notifications with consistent types supported ({op.name})') print_ntf_type_free(ri) - for op_name, op in parsed.ntfs.items(): + for _op_name, op in parsed.ntfs.items(): if 'event' in op: cw.p(f"/* {op.enum_name} - event */") diff --git a/tools/net/ynl/pyynl/ynl_gen_rst.py b/tools/net/ynl/pyynl/ynl_gen_rst.py index 90ae19aac89d..30324e2fd682 100755 --- a/tools/net/ynl/pyynl/ynl_gen_rst.py +++ b/tools/net/ynl/pyynl/ynl_gen_rst.py @@ -19,6 +19,7 @@ import sys import argparse import logging +# pylint: disable=no-name-in-module,wrong-import-position sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix()) from lib import YnlDocGenerator # pylint: disable=C0413 @@ -60,6 +61,7 @@ def write_to_rstfile(content: str, filename: str) -> None: rst_file.write(content) +# pylint: disable=broad-exception-caught def main() -> None: """Main function that reads the YAML files and generates the RST files""" diff --git a/tools/net/ynl/ynltool/Makefile b/tools/net/ynl/ynltool/Makefile index f5b1de32daa5..48b0f32050f0 100644 --- a/tools/net/ynl/ynltool/Makefile +++ b/tools/net/ynl/ynltool/Makefile @@ -13,7 +13,7 @@ endif CFLAGS += -I../lib -I../generated -I../../../include/uapi/ SRC_VERSION := \ - $(shell make --no-print-directory -sC ../../../.. kernelversion || \ + $(shell make --no-print-directory -sC ../../../.. kernelversion 2>/dev/null || \ echo "unknown") CFLAGS += -DSRC_VERSION='"$(SRC_VERSION)"' diff --git a/tools/net/ynl/ynltool/qstats.c b/tools/net/ynl/ynltool/qstats.c index 31fb45709ffa..a6c28ba4f25c 100644 --- a/tools/net/ynl/ynltool/qstats.c +++ b/tools/net/ynl/ynltool/qstats.c @@ -237,13 +237,47 @@ static void print_plain_qstats(struct netdev_qstats_get_list *qstats) } } -static int do_show(int argc, char **argv) +static struct netdev_qstats_get_list * +qstats_dump(enum netdev_qstats_scope scope) { struct netdev_qstats_get_list *qstats; struct netdev_qstats_get_req *req; struct ynl_error yerr; struct ynl_sock *ys; - int ret = 0; + + ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!ys) { + p_err("YNL: %s", yerr.msg); + return NULL; + } + + req = netdev_qstats_get_req_alloc(); + if (!req) { + p_err("failed to allocate qstats request"); + goto err_close; + } + + if (scope) + netdev_qstats_get_req_set_scope(req, scope); + + qstats = netdev_qstats_get_dump(ys, req); + netdev_qstats_get_req_free(req); + if (!qstats) { + p_err("failed to get queue stats: %s", ys->err.msg); + goto err_close; + } + + ynl_sock_destroy(ys); + return qstats; + +err_close: + ynl_sock_destroy(ys); + return NULL; +} + +static int do_show(int argc, char **argv) +{ + struct netdev_qstats_get_list *qstats; /* Parse options */ while (argc > 0) { @@ -268,29 +302,9 @@ static int do_show(int argc, char **argv) } } - ys = ynl_sock_create(&ynl_netdev_family, &yerr); - if (!ys) { - p_err("YNL: %s", yerr.msg); + qstats = qstats_dump(scope); + if (!qstats) return -1; - } - - req = netdev_qstats_get_req_alloc(); - if (!req) { - p_err("failed to allocate qstats request"); - ret = -1; - goto exit_close; - } - - if (scope) - netdev_qstats_get_req_set_scope(req, scope); - - qstats = netdev_qstats_get_dump(ys, req); - netdev_qstats_get_req_free(req); - if (!qstats) { - p_err("failed to get queue stats: %s", ys->err.msg); - ret = -1; - goto exit_close; - } /* Print the stats as returned by the kernel */ if (json_output) @@ -299,9 +313,7 @@ static int do_show(int argc, char **argv) print_plain_qstats(qstats); netdev_qstats_get_list_free(qstats); -exit_close: - ynl_sock_destroy(ys); - return ret; + return 0; } static void compute_stats(__u64 *values, unsigned int count, @@ -406,10 +418,7 @@ static int cmp_ifindex_type(const void *a, const void *b) static int do_balance(int argc, char **argv __attribute__((unused))) { struct netdev_qstats_get_list *qstats; - struct netdev_qstats_get_req *req; struct netdev_qstats_get_rsp **sorted; - struct ynl_error yerr; - struct ynl_sock *ys; unsigned int count = 0; unsigned int i, j; int ret = 0; @@ -419,29 +428,9 @@ static int do_balance(int argc, char **argv __attribute__((unused))) return -1; } - ys = ynl_sock_create(&ynl_netdev_family, &yerr); - if (!ys) { - p_err("YNL: %s", yerr.msg); + qstats = qstats_dump(NETDEV_QSTATS_SCOPE_QUEUE); + if (!qstats) return -1; - } - - req = netdev_qstats_get_req_alloc(); - if (!req) { - p_err("failed to allocate qstats request"); - ret = -1; - goto exit_close; - } - - /* Always use queue scope for balance analysis */ - netdev_qstats_get_req_set_scope(req, NETDEV_QSTATS_SCOPE_QUEUE); - - qstats = netdev_qstats_get_dump(ys, req); - netdev_qstats_get_req_free(req); - if (!qstats) { - p_err("failed to get queue stats: %s", ys->err.msg); - ret = -1; - goto exit_close; - } /* Count and sort queues */ ynl_dump_foreach(qstats, qs) @@ -576,11 +565,68 @@ exit_free_sorted: free(sorted); exit_free_qstats: netdev_qstats_get_list_free(qstats); -exit_close: - ynl_sock_destroy(ys); return ret; } +static int do_hw_gro(int argc, char **argv __attribute__((unused))) +{ + struct netdev_qstats_get_list *qstats; + + if (argc > 0) { + p_err("hw-gro command takes no arguments"); + return -1; + } + + qstats = qstats_dump(0); + if (!qstats) + return -1; + + if (json_output) + jsonw_start_array(json_wtr); + + ynl_dump_foreach(qstats, qs) { + char ifname[IF_NAMESIZE]; + const char *name; + double savings; + + if (!qs->_present.rx_packets || + !qs->_present.rx_hw_gro_packets || + !qs->_present.rx_hw_gro_wire_packets) + continue; + + if (!qs->rx_packets) + continue; + + /* How many skbs did we avoid allocating thanks to HW GRO */ + savings = (double)(qs->rx_hw_gro_wire_packets - + qs->rx_hw_gro_packets) / + qs->rx_packets * 100.0; + + name = if_indextoname(qs->ifindex, ifname); + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "ifindex", qs->ifindex); + if (name) + jsonw_string_field(json_wtr, "ifname", name); + jsonw_float_field(json_wtr, "savings", savings); + jsonw_end_object(json_wtr); + } else { + if (name) + printf("%s", name); + else + printf("ifindex:%u", qs->ifindex); + printf(": %.1f%% savings\n", savings); + } + } + + if (json_output) + jsonw_end_array(json_wtr); + + netdev_qstats_get_list_free(qstats); + return 0; +} + static int do_help(int argc __attribute__((unused)), char **argv __attribute__((unused))) { @@ -590,9 +636,10 @@ static int do_help(int argc __attribute__((unused)), } fprintf(stderr, - "Usage: %s qstats { COMMAND | help }\n" - " %s qstats [ show ] [ OPTIONS ]\n" - " %s qstats balance\n" + "Usage: %1$s qstats { COMMAND | help }\n" + " %1$s qstats [ show ] [ OPTIONS ]\n" + " %1$s qstats balance\n" + " %1$s qstats hw-gro\n" "\n" " OPTIONS := { scope queue | group-by { device | queue } }\n" "\n" @@ -601,9 +648,14 @@ static int do_help(int argc __attribute__((unused)), " show scope queue - Display per-queue statistics\n" " show group-by device - Display device-aggregated statistics (default)\n" " show group-by queue - Display per-queue statistics\n" - " balance - Analyze traffic distribution balance.\n" + "\n" + " Analysis:\n" + " balance - Traffic distribution between queues.\n" + " hw-gro - HW GRO effectiveness analysis\n" + " - savings - delta between packets received\n" + " on the wire and packets seen by the kernel.\n" "", - bin_name, bin_name, bin_name); + bin_name); return 0; } @@ -611,6 +663,7 @@ static int do_help(int argc __attribute__((unused)), static const struct cmd qstats_cmds[] = { { "show", do_show }, { "balance", do_balance }, + { "hw-gro", do_hw_gro }, { "help", do_help }, { 0 } }; diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index f4af82508228..73bfea220d1b 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -711,10 +711,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec immr = find_reloc_by_dest(elf, (void *)sec, offset+3); disp = find_reloc_by_dest(elf, (void *)sec, offset+7); - if (!immr || strcmp(immr->sym->name, "pv_ops")) + if (!immr || strncmp(immr->sym->name, "pv_ops", 6)) break; - idx = (reloc_addend(immr) + 8) / sizeof(void *); + idx = pv_ops_idx_off(immr->sym->name); + if (idx < 0) + break; + + idx += (reloc_addend(immr) + 8) / sizeof(void *); func = disp->sym; if (disp->sym->type == STT_SECTION) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3f7999317f4d..37f87c4a0134 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -197,7 +197,8 @@ static bool is_rust_noreturn(const struct symbol *func) * as well as changes to the source code itself between versions (since * these come from the Rust standard library). */ - return str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail") || + return str_ends_with(func->name, "_4core3num22from_ascii_radix_panic") || + str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail") || str_ends_with(func->name, "_4core6option13expect_failed") || str_ends_with(func->name, "_4core6option13unwrap_failed") || str_ends_with(func->name, "_4core6result13unwrap_failed") || @@ -520,21 +521,58 @@ static int decode_instructions(struct objtool_file *file) } /* - * Read the pv_ops[] .data table to find the static initialized values. + * Known pv_ops*[] arrays. */ -static int add_pv_ops(struct objtool_file *file, const char *symname) +static struct { + const char *name; + int idx_off; +} pv_ops_tables[] = { + { .name = "pv_ops", }, + { .name = "pv_ops_lock", }, + { .name = NULL, .idx_off = -1 } +}; + +/* + * Get index offset for a pv_ops* array. + */ +int pv_ops_idx_off(const char *symname) +{ + int idx; + + for (idx = 0; pv_ops_tables[idx].name; idx++) { + if (!strcmp(symname, pv_ops_tables[idx].name)) + break; + } + + return pv_ops_tables[idx].idx_off; +} + +/* + * Read a pv_ops*[] .data table to find the static initialized values. + */ +static int add_pv_ops(struct objtool_file *file, int pv_ops_idx) { struct symbol *sym, *func; unsigned long off, end; struct reloc *reloc; - int idx; + int idx, idx_off; + const char *symname; + symname = pv_ops_tables[pv_ops_idx].name; sym = find_symbol_by_name(file->elf, symname); - if (!sym) - return 0; + if (!sym) { + ERROR("Unknown pv_ops array %s", symname); + return -1; + } off = sym->offset; end = off + sym->len; + idx_off = pv_ops_tables[pv_ops_idx].idx_off; + if (idx_off < 0) { + ERROR("pv_ops array %s has unknown index offset", symname); + return -1; + } + for (;;) { reloc = find_reloc_by_dest_range(file->elf, sym->sec, off, end - off); if (!reloc) @@ -552,7 +590,7 @@ static int add_pv_ops(struct objtool_file *file, const char *symname) return -1; } - if (objtool_pv_add(file, idx, func)) + if (objtool_pv_add(file, idx + idx_off, func)) return -1; off = reloc_offset(reloc) + 1; @@ -568,14 +606,6 @@ static int add_pv_ops(struct objtool_file *file, const char *symname) */ static int init_pv_ops(struct objtool_file *file) { - static const char *pv_ops_tables[] = { - "pv_ops", - "xen_cpu_ops", - "xen_irq_ops", - "xen_mmu_ops", - NULL, - }; - const char *pv_ops; struct symbol *sym; int idx, nr; @@ -584,11 +614,20 @@ static int init_pv_ops(struct objtool_file *file) file->pv_ops = NULL; - sym = find_symbol_by_name(file->elf, "pv_ops"); - if (!sym) + nr = 0; + for (idx = 0; pv_ops_tables[idx].name; idx++) { + sym = find_symbol_by_name(file->elf, pv_ops_tables[idx].name); + if (!sym) { + pv_ops_tables[idx].idx_off = -1; + continue; + } + pv_ops_tables[idx].idx_off = nr; + nr += sym->len / sizeof(unsigned long); + } + + if (nr == 0) return 0; - nr = sym->len / sizeof(unsigned long); file->pv_ops = calloc(nr, sizeof(struct pv_state)); if (!file->pv_ops) { ERROR_GLIBC("calloc"); @@ -598,8 +637,10 @@ static int init_pv_ops(struct objtool_file *file) for (idx = 0; idx < nr; idx++) INIT_LIST_HEAD(&file->pv_ops[idx].targets); - for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) { - if (add_pv_ops(file, pv_ops)) + for (idx = 0; pv_ops_tables[idx].name; idx++) { + if (pv_ops_tables[idx].idx_off < 0) + continue; + if (add_pv_ops(file, idx)) return -1; } @@ -682,7 +723,7 @@ static int create_static_call_sections(struct objtool_file *file) key_sym = find_symbol_by_name(file->elf, tmp); if (!key_sym) { - if (!opts.module || file->klp) { + if (!opts.module) { ERROR("static_call: can't find static_call_key symbol: %s", tmp); return -1; } @@ -4761,7 +4802,7 @@ static int validate_ibt(struct objtool_file *file) !strcmp(sec->name, "__bug_table") || !strcmp(sec->name, "__ex_table") || !strcmp(sec->name, "__jump_table") || - !strcmp(sec->name, "__klp_funcs") || + !strcmp(sec->name, ".init.klp_funcs") || !strcmp(sec->name, "__mcount_loc") || !strcmp(sec->name, ".llvm.call-graph-profile") || !strcmp(sec->name, ".llvm_bb_addr_map") || diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c index 2b5059f55e40..26f08d41f2b1 100644 --- a/tools/objtool/disas.c +++ b/tools/objtool/disas.c @@ -108,6 +108,8 @@ static int sprint_name(char *str, const char *name, unsigned long offset) #define DINFO_FPRINTF(dinfo, ...) \ ((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__)) +#define bfd_vma_fmt \ + __builtin_choose_expr(sizeof(bfd_vma) == sizeof(unsigned long), "%#lx <%s>", "%#llx <%s>") static int disas_result_fprintf(struct disas_context *dctx, const char *fmt, va_list ap) @@ -170,10 +172,10 @@ static void disas_print_addr_sym(struct section *sec, struct symbol *sym, if (sym) { sprint_name(symstr, sym->name, addr - sym->offset); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, symstr); } else { str = offstr(sec, addr); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, str); free(str); } } @@ -252,7 +254,7 @@ static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo) * example: "lea 0x0(%rip),%rdi". The kernel can reference * the next IP with _THIS_IP_ macro. */ - DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, "_THIS_IP_"); return; } @@ -264,11 +266,11 @@ static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo) */ if (reloc->sym->type == STT_SECTION) { str = offstr(reloc->sym->sec, reloc->sym->offset + offset); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, str); free(str); } else { sprint_name(symstr, reloc->sym->name, offset); - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, symstr); } } @@ -311,7 +313,7 @@ static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo) */ sym = insn_call_dest(insn); if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) { - DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name); + DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, sym->name); return; } diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 6a8ed9c62323..2c02c7b49265 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -18,15 +18,14 @@ #include <errno.h> #include <libgen.h> #include <ctype.h> +#include <linux/align.h> +#include <linux/kernel.h> #include <linux/interval_tree_generic.h> +#include <linux/log2.h> #include <objtool/builtin.h> #include <objtool/elf.h> #include <objtool/warn.h> -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_UP_POW2(x) (1U << ((8 * sizeof(x)) - __builtin_clz((x) - 1U))) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - static inline u32 str_hash(const char *str) { return jhash(str, strlen(str), 0); @@ -1336,7 +1335,7 @@ unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char return -1; } - offset = ALIGN_UP(strtab->sh.sh_size, strtab->sh.sh_addralign); + offset = ALIGN(strtab->sh.sh_size, strtab->sh.sh_addralign); if (!elf_add_data(elf, strtab, str, strlen(str) + 1)) return -1; @@ -1378,7 +1377,7 @@ void *elf_add_data(struct elf *elf, struct section *sec, const void *data, size_ sec->data->d_size = size; sec->data->d_align = 1; - offset = ALIGN_UP(sec->sh.sh_size, sec->sh.sh_addralign); + offset = ALIGN(sec->sh.sh_size, sec->sh.sh_addralign); sec->sh.sh_size = offset + size; mark_sec_changed(elf, sec, true); @@ -1502,7 +1501,7 @@ static int elf_alloc_reloc(struct elf *elf, struct section *rsec) rsec->data->d_size = nr_relocs_new * elf_rela_size(elf); rsec->sh.sh_size = rsec->data->d_size; - nr_alloc = MAX(64, ALIGN_UP_POW2(nr_relocs_new)); + nr_alloc = max(64UL, roundup_pow_of_two(nr_relocs_new)); if (nr_alloc <= rsec->nr_alloc_relocs) return 0; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 2e1346ad5e92..5f2f77bd9b41 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -159,5 +159,6 @@ const char *objtool_disas_insn(struct instruction *insn); extern size_t sym_name_max_len; extern struct disas_context *objtool_disas_ctx; +int pv_ops_idx_off(const char *symname); #endif /* _CHECK_H */ diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h index ad830a7ce55b..e32e5e8bc631 100644 --- a/tools/objtool/include/objtool/klp.h +++ b/tools/objtool/include/objtool/klp.h @@ -6,12 +6,12 @@ #define SHN_LIVEPATCH 0xff20 /* - * __klp_objects and __klp_funcs are created by klp diff and used by the patch - * module init code to build the klp_patch, klp_object and klp_func structs - * needed by the livepatch API. + * .init.klp_objects and .init.klp_funcs are created by klp diff and used by the + * patch module init code to build the klp_patch, klp_object and klp_func + * structs needed by the livepatch API. */ -#define KLP_OBJECTS_SEC "__klp_objects" -#define KLP_FUNCS_SEC "__klp_funcs" +#define KLP_OBJECTS_SEC ".init.klp_objects" +#define KLP_FUNCS_SEC ".init.klp_funcs" /* * __klp_relocs is an intermediate section which are created by klp diff and diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c index 4d1f9e9977eb..9f1f4011eb9c 100644 --- a/tools/objtool/klp-diff.c +++ b/tools/objtool/klp-diff.c @@ -364,11 +364,40 @@ static int correlate_symbols(struct elfs *e) struct symbol *file1_sym, *file2_sym; struct symbol *sym1, *sym2; - /* Correlate locals */ - for (file1_sym = first_file_symbol(e->orig), - file2_sym = first_file_symbol(e->patched); ; - file1_sym = next_file_symbol(e->orig, file1_sym), - file2_sym = next_file_symbol(e->patched, file2_sym)) { + file1_sym = first_file_symbol(e->orig); + file2_sym = first_file_symbol(e->patched); + + /* + * Correlate any locals before the first FILE symbol. This has been + * seen when LTO inexplicably strips the initramfs_data.o FILE symbol + * due to the file only containing data and no code. + */ + for_each_sym(e->orig, sym1) { + if (sym1 == file1_sym || !is_local_sym(sym1)) + break; + + if (dont_correlate(sym1)) + continue; + + for_each_sym(e->patched, sym2) { + if (sym2 == file2_sym || !is_local_sym(sym2)) + break; + + if (sym2->twin || dont_correlate(sym2)) + continue; + + if (strcmp(sym1->demangled_name, sym2->demangled_name)) + continue; + + sym1->twin = sym2; + sym2->twin = sym1; + break; + } + } + + /* Correlate locals after the first FILE symbol */ + for (; ; file1_sym = next_file_symbol(e->orig, file1_sym), + file2_sym = next_file_symbol(e->patched, file2_sym)) { if (!file1_sym && file2_sym) { ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name); @@ -1425,9 +1454,6 @@ static int clone_special_sections(struct elfs *e) { struct section *patched_sec; - if (create_fake_symbols(e->patched)) - return -1; - for_each_sec(e->patched, patched_sec) { if (is_special_section(patched_sec)) { if (clone_special_section(e, patched_sec)) @@ -1439,7 +1465,7 @@ static int clone_special_sections(struct elfs *e) } /* - * Create __klp_objects and __klp_funcs sections which are intermediate + * Create .init.klp_objects and .init.klp_funcs sections which are intermediate * sections provided as input to the patch module's init code for building the * klp_patch, klp_object and klp_func structs for the livepatch API. */ @@ -1704,6 +1730,17 @@ int cmd_klp_diff(int argc, const char **argv) if (!e.out) return -1; + /* + * Special section fake symbols are needed so that individual special + * section entries can be extracted by clone_special_sections(). + * + * Note the fake symbols are also needed by clone_included_functions() + * because __WARN_printf() call sites add references to bug table + * entries in the calling functions. + */ + if (create_fake_symbols(e.patched)) + return -1; + if (clone_included_functions(&e)) return -1; diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 74cdbd2ce9d0..524d46478364 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -27,7 +27,6 @@ static bool is_ignored_symbol(const char *name, char type) * stable symbol list. */ "kallsyms_offsets", - "kallsyms_relative_base", "kallsyms_num_syms", "kallsyms_names", "kallsyms_markers", diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h index 47051871b436..6e1d5b955aae 100644 --- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,8 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 956ea273c2c7..01a21b6aa031 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -939,6 +939,7 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok) { const char *p, *suffix; bool has_hex = false; + bool has_underscore = false; size_t tok_len = strlen(tok); /* Check start of pmu_name for equality. */ @@ -949,13 +950,14 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok) if (*p == 0) return true; - if (*p == '_') { - ++p; - ++suffix; - } - - /* Ensure we end in a number */ + /* Ensure we end in a number or a mix of number and "_". */ while (1) { + if (!has_underscore && (*p == '_')) { + has_underscore = true; + ++p; + ++suffix; + } + if (!isxdigit(*p)) return false; if (!has_hex) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index a1df9196dc45..969716dfe8de 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -315,7 +315,17 @@ endif $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h $(INSTALL_DATA) lib/powercap.h $(DESTDIR)${includedir}/powercap.h -install-tools: $(OUTPUT)cpupower +# SYSTEMD=false disables installation of the systemd unit file +SYSTEMD ?= true + +install-systemd: + $(INSTALL) -d $(DESTDIR)${unitdir} + sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service' + $(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service' + +INSTALL_SYSTEMD := $(if $(filter true,$(strip $(SYSTEMD))),install-systemd) + +install-tools: $(OUTPUT)cpupower $(INSTALL_SYSTEMD) $(INSTALL) -d $(DESTDIR)${bindir} $(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir} $(INSTALL) -d $(DESTDIR)${bash_completion_dir} @@ -324,9 +334,6 @@ install-tools: $(OUTPUT)cpupower $(INSTALL_DATA) cpupower-service.conf '$(DESTDIR)${confdir}' $(INSTALL) -d $(DESTDIR)${libexecdir} $(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower' - $(INSTALL) -d $(DESTDIR)${unitdir} - sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service' - $(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service' install-man: $(INSTALL_DATA) -D man/cpupower.1 $(DESTDIR)${mandir}/man1/cpupower.1 @@ -406,4 +413,4 @@ help: @echo ' uninstall - Remove previously installed files from the dir defined by "DESTDIR"' @echo ' cmdline or Makefile config block option (default: "")' -.PHONY: all utils libcpupower update-po create-gmo install-lib install-tools install-man install-gmo install uninstall clean help +.PHONY: all utils libcpupower update-po create-gmo install-lib install-systemd install-tools install-man install-gmo install uninstall clean help diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c index f2c1139adf71..2fcb343d8e75 100644 --- a/tools/power/cpupower/lib/cpuidle.c +++ b/tools/power/cpupower/lib/cpuidle.c @@ -150,6 +150,7 @@ unsigned long long cpuidle_state_get_one_value(unsigned int cpu, if (len == 0) return 0; + errno = 0; value = strtoull(linebuf, &endp, 0); if (endp == linebuf || errno == ERANGE) @@ -193,8 +194,7 @@ static char *cpuidle_state_get_one_string(unsigned int cpu, if (result == NULL) return NULL; - if (result[strlen(result) - 1] == '\n') - result[strlen(result) - 1] = '\0'; + result[strcspn(result, "\n")] = '\0'; return result; } @@ -366,8 +366,7 @@ static char *sysfs_cpuidle_get_one_string(enum cpuidle_string which) if (result == NULL) return NULL; - if (result[strlen(result) - 1] == '\n') - result[strlen(result) - 1] = '\0'; + result[strcspn(result, "\n")] = '\0'; return result; } diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 7d3732f5f2f6..5fe01e516817 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -270,7 +270,7 @@ static int get_freq_hardware(unsigned int cpu, unsigned int human) { unsigned long freq; - if (cpupower_cpu_info.caps & CPUPOWER_CAP_APERF) + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) return -EINVAL; freq = cpufreq_get_freq_hardware(cpu); diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index e0d17f0de3fe..81b4763a97d6 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -111,7 +111,7 @@ static void proc_cpuidle_cpu_output(unsigned int cpu) printf(_("max_cstate: C%u\n"), cstates-1); printf(_("maximum allowed latency: %lu usec\n"), max_allowed_cstate); printf(_("states:\t\n")); - for (cstate = 1; cstate < cstates; cstate++) { + for (cstate = 0; cstate < cstates; cstate++) { printf(_(" C%d: " "type[C%d] "), cstate, cstate); printf(_("promotion[--] demotion[--] ")); diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c index 8b42c2f0a5b0..4225eff9833d 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -70,7 +70,7 @@ static int cpuidle_stop(void) current_count[cpu][state] = cpuidle_state_time(cpu, state); dprint("CPU %d - State: %d - Val: %llu\n", - cpu, state, previous_count[cpu][state]); + cpu, state, current_count[cpu][state]); } } return 0; diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile index e4bda2474060..47ad7444677e 100644 --- a/tools/sched_ext/Makefile +++ b/tools/sched_ext/Makefile @@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg +c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt $(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(BINDIR)/%: \ diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 16a42e4060f6..56a9d1557ac4 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -65,7 +65,6 @@ It's also recommended that you also include the following Kconfig options: ``` CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_PAHOLE_HAS_SPLIT_BTF=y CONFIG_PAHOLE_HAS_BTF_TAG=y ``` diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 55df8b798865..1c2376b75b5d 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -301,8 +301,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init) int ret; ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); - if (ret) + if (ret) { + scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret); return ret; + } timer = bpf_map_lookup_elem(¢ral_timer, &key); if (!timer) diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c index 6326ce598c8e..9b67ab11b04c 100644 --- a/tools/sched_ext/scx_cpu0.bpf.c +++ b/tools/sched_ext/scx_cpu0.bpf.c @@ -71,7 +71,15 @@ void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev) s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init) { - return scx_bpf_create_dsq(DSQ_CPU0, -1); + int ret; + + ret = scx_bpf_create_dsq(DSQ_CPU0, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", DSQ_CPU0, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei) diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 43126858b8e4..0e785cff0f24 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -842,8 +842,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, * unlikely case that it breaks. */ ret = scx_bpf_create_dsq(cgid, -1); - if (ret) + if (ret) { + scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret); return ret; + } cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -927,7 +929,15 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) { - return scx_bpf_create_dsq(FALLBACK_DSQ, -1); + int ret; + + ret = scx_bpf_create_dsq(FALLBACK_DSQ, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", FALLBACK_DSQ, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c new file mode 100644 index 000000000000..267011b57cba --- /dev/null +++ b/tools/sched_ext/scx_pair.bpf.c @@ -0,0 +1,610 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext core-scheduler which always makes every sibling CPU pair + * execute from the same CPU cgroup. + * + * This scheduler is a minimal implementation and would need some form of + * priority handling both inside each cgroup and across the cgroups to be + * practically useful. + * + * Each CPU in the system is paired with exactly one other CPU, according to a + * "stride" value that can be specified when the BPF scheduler program is first + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee + * that they will only ever schedule tasks that belong to the same CPU cgroup. + * + * Scheduler Initialization + * ------------------------ + * + * The scheduler BPF program is first initialized from user space, before it is + * enabled. During this initialization process, each CPU on the system is + * assigned several values that are constant throughout its runtime: + * + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling + * decisions. Paired CPUs always schedule tasks from the same + * CPU cgroup, and synchronize with each other to guarantee + * that this constraint is not violated. + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access + * a struct pair_ctx object that is shared between the pair. + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the + * pair. Each struct pair_ctx has an active_mask field, + * which is a bitmap used to indicate whether each core + * in the pair currently has an actively running task. + * This index specifies which entry in the bitmap corresponds + * to each CPU in the pair. + * + * During this initialization, the CPUs are paired according to a "stride" that + * may be specified when invoking the user space program that initializes and + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. + * + * Tasks and cgroups + * ----------------- + * + * Every cgroup in the system is registered with the scheduler using the + * pair_cgroup_init() callback, and every task in the system is associated with + * exactly one cgroup. At a high level, the idea with the pair scheduler is to + * always schedule tasks from the same cgroup within a given CPU pair. When a + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its + * cgroup ID is read from its task struct, and then a corresponding queue map + * is used to FIFO-enqueue the task for that cgroup. + * + * If you look through the implementation of the scheduler, you'll notice that + * there is quite a bit of complexity involved with looking up the per-cgroup + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's + * allocated in the BPF program. This is done because we use separate maps to + * store the FIFO queue of tasks, and the length of that map, per cgroup. This + * complexity is only present because of current deficiencies in BPF that will + * soon be addressed. The main point to keep in mind is that newly enqueued + * tasks are added to their cgroup's FIFO queue. + * + * Dispatching tasks + * ----------------- + * + * This section will describe how enqueued tasks are dispatched and scheduled. + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is + * as follows: + * + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is + * the structure that's used to synchronize amongst the two pair CPUs in their + * scheduling decisions. After any of the following events have occurred: + * + * - The cgroup's slice run has expired, or + * - The cgroup becomes empty, or + * - Either CPU in the pair is preempted by a higher priority scheduling class + * + * The cgroup transitions to the draining state and stops executing new tasks + * from the cgroup. + * + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and + * wait for the pair CPU to be preempted. + * + * 3. Otherwise, if the pair CPU is not running a task, we can move onto + * scheduling new tasks. Pop the next cgroup id from the top_q queue. + * + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. + * + * Note again that this scheduling behavior is simple, but the implementation + * is complex mostly because this it hits several BPF shortcomings and has to + * work around in often awkward ways. Most of the shortcomings are expected to + * be resolved in the near future which should allow greatly simplifying this + * scheduler. + * + * Dealing with preemption + * ----------------------- + * + * SCX is the lowest priority sched_class, and could be preempted by them at + * any time. To address this, the scheduler implements pair_cpu_release() and + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when + * the scheduler loses and gains control of the CPU respectively. + * + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and + * then invoke: + * + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + * + * This preempts the pair CPU, and waits until it has re-entered the scheduler + * before returning. This is necessary to ensure that the higher priority + * sched_class that preempted our scheduler does not schedule a task + * concurrently with our pair CPU. + * + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable + * pair scheduling. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <scx/common.bpf.h> +#include "scx_pair.h" + +char _license[] SEC("license") = "GPL"; + +/* !0 for veristat, set during init */ +const volatile u32 nr_cpu_ids = 1; + +/* a pair of CPUs stay on a cgroup for this duration */ +const volatile u32 pair_batch_dur_ns; + +/* cpu ID -> pair cpu ID */ +const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); + +/* cpu ID -> pair_id */ +const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); + +/* CPU ID -> CPU # in the pair (0 or 1) */ +const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); + +struct pair_ctx { + struct bpf_spin_lock lock; + + /* the cgroup the pair is currently executing */ + u64 cgid; + + /* the pair started executing the current cgroup at */ + u64 started_at; + + /* whether the current cgroup is draining */ + bool draining; + + /* the CPUs that are currently active on the cgroup */ + u32 active_mask; + + /* + * the CPUs that are currently preempted and running tasks in a + * different scheduler. + */ + u32 preempted_mask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct pair_ctx); +} pair_ctx SEC(".maps"); + +/* queue of cgrp_q's possibly with tasks on them */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + /* + * Because it's difficult to build strong synchronization encompassing + * multiple non-trivial operations in BPF, this queue is managed in an + * opportunistic way so that we guarantee that a cgroup w/ active tasks + * is always on it but possibly multiple times. Once we have more robust + * synchronization constructs and e.g. linked list, we should be able to + * do this in a prettier way but for now just size it big enough. + */ + __uint(max_entries, 4 * MAX_CGRPS); + __type(value, u64); +} top_q SEC(".maps"); + +/* per-cgroup q which FIFOs the tasks from the cgroup */ +struct cgrp_q { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_QUEUED); + __type(value, u32); +}; + +/* + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local + * storage; however, a cgroup local storage can only be accessed from the BPF + * progs attached to the cgroup. For now, work around by allocating array of + * cgrp_q's and then allocating per-cgroup indices. + * + * Another caveat: It's difficult to populate a large array of maps statically + * or from BPF. Initialize it from userland. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_CGRPS); + __type(key, s32); + __array(values, struct cgrp_q); +} cgrp_q_arr SEC(".maps"); + +static u64 cgrp_q_len[MAX_CGRPS]; + +/* + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be + * useful to have as a map type. + */ +static u32 cgrp_q_idx_cursor; +static u64 cgrp_q_idx_busy[MAX_CGRPS]; + +/* + * All added up, the following is what we do: + * + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking + * for a free ID. If not found, fail cgroup creation with -EBUSY. + * + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following + * cgrp_q_idx_hash. + * + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. + * + * This is sadly complicated for something pretty simple. Hopefully, we should + * be able to simplify in the future. + */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_CGRPS); + __uint(key_size, sizeof(u64)); /* cgrp ID */ + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ +} cgrp_q_idx_hash SEC(".maps"); + +/* statistics */ +u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; +u64 nr_exps, nr_exp_waits, nr_exp_empty; +u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + struct cgrp_q *cgq; + s32 pid = p->pid; + u64 cgid; + u32 *q_idx; + u64 *cgq_len; + + __sync_fetch_and_add(&nr_total, 1); + + cgrp = scx_bpf_task_cgroup(p); + cgid = cgrp->kn->id; + bpf_cgroup_release(cgrp); + + /* find the cgroup's q and push @p into it */ + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return; + } + + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); + if (!cgq) { + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", + cgid, *q_idx); + return; + } + + if (bpf_map_push_elem(cgq, &pid, 0)) { + scx_bpf_error("cgroup[%llu] queue overflow", cgid); + return; + } + + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len) { + scx_bpf_error("MEMBER_VTPR malfunction"); + return; + } + + if (!__sync_fetch_and_add(cgq_len, 1) && + bpf_map_push_elem(&top_q, &cgid, 0)) { + scx_bpf_error("top_q overflow"); + return; + } +} + +static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) +{ + u32 *vptr; + + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); + if (!(*pairc)) + return -EINVAL; + + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *mask = 1U << *vptr; + + return 0; +} + +__attribute__((noinline)) +static int try_dispatch(s32 cpu) +{ + struct pair_ctx *pairc; + struct bpf_map *cgq_map; + struct task_struct *p; + u64 now = scx_bpf_now(); + bool kick_pair = false; + bool expired, pair_preempted; + u32 *vptr, in_pair_mask; + s32 pid, q_idx; + u64 cgid; + int ret; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) { + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", + cpu); + return -ENOENT; + } + + bpf_spin_lock(&pairc->lock); + pairc->active_mask &= ~in_pair_mask; + + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); + if (expired || pairc->draining) { + u64 new_cgid = 0; + + __sync_fetch_and_add(&nr_exps, 1); + + /* + * We're done with the current cgid. An obvious optimization + * would be not draining if the next cgroup is the current one. + * For now, be dumb and always expire. + */ + pairc->draining = true; + + pair_preempted = pairc->preempted_mask; + if (pairc->active_mask || pair_preempted) { + /* + * The other CPU is still active, or is no longer under + * our control due to e.g. being preempted by a higher + * priority sched_class. We want to wait until this + * cgroup expires, or until control of our pair CPU has + * been returned to us. + * + * If the pair controls its CPU, and the time already + * expired, kick. When the other CPU arrives at + * dispatch and clears its active mask, it'll push the + * pair to the next cgroup and kick this CPU. + */ + __sync_fetch_and_add(&nr_exp_waits, 1); + bpf_spin_unlock(&pairc->lock); + if (expired && !pair_preempted) + kick_pair = true; + goto out_maybe_kick; + } + + bpf_spin_unlock(&pairc->lock); + + /* + * Pick the next cgroup. It'd be easier / cleaner to not drop + * pairc->lock and use stronger synchronization here especially + * given that we'll be switching cgroups significantly less + * frequently than tasks. Unfortunately, bpf_spin_lock can't + * really protect anything non-trivial. Let's do opportunistic + * operations instead. + */ + bpf_repeat(BPF_MAX_LOOPS) { + u32 *q_idx; + u64 *cgq_len; + + if (bpf_map_pop_elem(&top_q, &new_cgid)) { + /* no active cgroup, go idle */ + __sync_fetch_and_add(&nr_exp_empty, 1); + return 0; + } + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); + if (!q_idx) + continue; + + /* + * This is the only place where empty cgroups are taken + * off the top_q. + */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len || !*cgq_len) + continue; + + /* + * If it has any tasks, requeue as we may race and not + * execute it. + */ + bpf_map_push_elem(&top_q, &new_cgid, 0); + break; + } + + bpf_spin_lock(&pairc->lock); + + /* + * The other CPU may already have started on a new cgroup while + * we dropped the lock. Make sure that we're still draining and + * start on the new cgroup. + */ + if (pairc->draining && !pairc->active_mask) { + __sync_fetch_and_add(&nr_cgrp_next, 1); + pairc->cgid = new_cgid; + pairc->started_at = now; + pairc->draining = false; + kick_pair = true; + } else { + __sync_fetch_and_add(&nr_cgrp_coll, 1); + } + } + + cgid = pairc->cgid; + pairc->active_mask |= in_pair_mask; + bpf_spin_unlock(&pairc->lock); + + /* again, it'd be better to do all these with the lock held, oh well */ + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!vptr) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return -ENOENT; + } + q_idx = *vptr; + + /* claim one task from cgrp_q w/ q_idx */ + bpf_repeat(BPF_MAX_LOOPS) { + u64 *cgq_len, len; + + cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); + if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { + /* the cgroup must be empty, expire and repeat */ + __sync_fetch_and_add(&nr_cgrp_empty, 1); + bpf_spin_lock(&pairc->lock); + pairc->draining = true; + pairc->active_mask &= ~in_pair_mask; + bpf_spin_unlock(&pairc->lock); + return -EAGAIN; + } + + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) + continue; + + break; + } + + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); + if (!cgq_map) { + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + if (bpf_map_pop_elem(cgq_map, &pid)) { + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", + cgid, q_idx); + return -ENOENT; + } + + p = bpf_task_from_pid(pid); + if (p) { + __sync_fetch_and_add(&nr_dispatched, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } else { + /* we don't handle dequeues, retry on lost tasks */ + __sync_fetch_and_add(&nr_missing, 1); + return -EAGAIN; + } + +out_maybe_kick: + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } + return 0; +} + +void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) +{ + bpf_repeat(BPF_MAX_LOOPS) { + if (try_dispatch(cpu) != -EAGAIN) + break; + } +} + +void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask &= ~in_pair_mask; + /* Kick the pair CPU, unless it was also preempted. */ + kick_pair = !pairc->preempted_mask; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); + } + } +} + +void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) +{ + int ret; + u32 in_pair_mask; + struct pair_ctx *pairc; + bool kick_pair; + + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); + if (ret) + return; + + bpf_spin_lock(&pairc->lock); + pairc->preempted_mask |= in_pair_mask; + pairc->active_mask &= ~in_pair_mask; + /* Kick the pair CPU if it's still running. */ + kick_pair = pairc->active_mask; + pairc->draining = true; + bpf_spin_unlock(&pairc->lock); + + if (kick_pair) { + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); + + if (pair) { + __sync_fetch_and_add(&nr_kicks, 1); + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + } + } + __sync_fetch_and_add(&nr_preemptions, 1); +} + +s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 i, q_idx; + + bpf_for(i, 0, MAX_CGRPS) { + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) + break; + } + if (i == MAX_CGRPS) + return -EBUSY; + + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); + if (busy) + *busy = 0; + return -EBUSY; + } + + return 0; +} + +void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) +{ + u64 cgid = cgrp->kn->id; + s32 *q_idx; + + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (q_idx) { + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); + if (busy) + *busy = 0; + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); + } +} + +void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(pair_ops, + .enqueue = (void *)pair_enqueue, + .dispatch = (void *)pair_dispatch, + .cpu_acquire = (void *)pair_cpu_acquire, + .cpu_release = (void *)pair_cpu_release, + .cgroup_init = (void *)pair_cgroup_init, + .cgroup_exit = (void *)pair_cgroup_exit, + .exit = (void *)pair_exit, + .name = "pair"); diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c new file mode 100644 index 000000000000..d3e97faa6334 --- /dev/null +++ b/tools/sched_ext/scx_pair.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <stdio.h> +#include <unistd.h> +#include <inttypes.h> +#include <signal.h> +#include <assert.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "scx_pair.h" +#include "scx_pair.bpf.skel.h" + +const char help_fmt[] = +"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" +"execute from the same CPU cgroup.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-S STRIDE]\n" +"\n" +" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_pair *skel; + struct bpf_link *link; + __u64 seq = 0, ecode; + __s32 stride, i, opt, outer_fd; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(pair_ops, scx_pair); + + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpu_ids > 0); + skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + + /* pair up the earlier half to the latter by default, override with -s */ + stride = skel->rodata->nr_cpu_ids / 2; + + while ((opt = getopt(argc, argv, "S:vh")) != -1) { + switch (opt) { + case 'S': + stride = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids); + + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) + skel->rodata_pair_cpu->pair_cpu[i] = -1; + + printf("Pairs: "); + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { + int j = (i + stride) % skel->rodata->nr_cpu_ids; + + if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) + continue; + + SCX_BUG_ON(i == j, + "Invalid stride %d - CPU%d wants to be its own pair", + stride, i); + + SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, + "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", + stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); + + skel->rodata_pair_cpu->pair_cpu[i] = j; + skel->rodata_pair_cpu->pair_cpu[j] = i; + skel->rodata_pair_id->pair_id[i] = i; + skel->rodata_pair_id->pair_id[j] = i; + skel->rodata_in_pair_idx->in_pair_idx[i] = 0; + skel->rodata_in_pair_idx->in_pair_idx[j] = 1; + + printf("[%d, %d] ", i, j); + } + printf("\n"); + + SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei); + + /* + * Populate the cgrp_q_arr map which is an array containing per-cgroup + * queues. It'd probably be better to do this from BPF but there are too + * many to initialize statically and there's no way to dynamically + * populate from BPF. + */ + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); + SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); + + printf("Initializing"); + for (i = 0; i < MAX_CGRPS; i++) { + __s32 inner_fd; + + if (exit_req) + break; + + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, + sizeof(__u32), MAX_QUEUED, NULL); + SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", + inner_fd); + SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), + "Failed to set inner map"); + close(inner_fd); + + if (!(i % 10)) + printf("."); + fflush(stdout); + } + printf("\n"); + + /* + * Fully initialized, attach and run. + */ + link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("[SEQ %llu]\n", seq++); + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_dispatched, + skel->bss->nr_missing); + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", + skel->bss->nr_kicks, + skel->bss->nr_preemptions); + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", + skel->bss->nr_exps, + skel->bss->nr_exp_waits, + skel->bss->nr_exp_empty); + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", + skel->bss->nr_cgrp_next, + skel->bss->nr_cgrp_coll, + skel->bss->nr_cgrp_empty); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_pair__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h new file mode 100644 index 000000000000..d9666a447d3f --- /dev/null +++ b/tools/sched_ext/scx_pair.h @@ -0,0 +1,9 @@ +#ifndef __SCX_EXAMPLE_PAIR_H +#define __SCX_EXAMPLE_PAIR_H + +enum { + MAX_QUEUED = 4096, + MAX_CGRPS = 4096, +}; + +#endif /* __SCX_EXAMPLE_PAIR_H */ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index df21fad0c438..d51d8c38f1cf 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -866,12 +866,16 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); - if (ret) + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); return ret; + } ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1); - if (ret) + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret); return ret; + } timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c new file mode 100644 index 000000000000..31b09958e8d5 --- /dev/null +++ b/tools/sched_ext/scx_sdt.bpf.c @@ -0,0 +1,716 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arena-based task data scheduler. This is a variation of scx_simple + * that uses a combined allocator and indexing structure to organize + * task data. Task context allocation is done when a task enters the + * scheduler, while freeing is done when it exits. Task contexts are + * retrieved from task-local storage, pointing to the allocated memory. + * + * The main purpose of this scheduler is to demostrate arena memory + * management. + * + * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com> + * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org> + * + */ +#include <scx/common.bpf.h> +#include <scx/bpf_arena_common.bpf.h> + +#include "scx_sdt.h" + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __uint(max_entries, 1 << 16); /* number of pages */ + __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ +#else + __uint(max_entries, 1 << 20); /* number of pages */ + __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ +#endif +} arena __weak SEC(".maps"); + +#define SHARED_DSQ 0 + +#define DEFINE_SDT_STAT(metric) \ +static inline void \ +stat_inc_##metric(struct scx_stats __arena *stats) \ +{ \ + cast_kern(stats); \ + stats->metric += 1; \ +} \ +__u64 stat_##metric; \ + +DEFINE_SDT_STAT(enqueue); +DEFINE_SDT_STAT(init); +DEFINE_SDT_STAT(exit); +DEFINE_SDT_STAT(select_idle_cpu); +DEFINE_SDT_STAT(select_busy_cpu); + +/* + * Necessary for cond_break/can_loop's semantics. According to kernel commit + * 011832b, the loop counter variable must be seen as imprecise and bounded + * by the verifier. Initializing it from a constant (e.g., i = 0;), then, + * makes it precise and prevents may_goto from helping with converging the + * loop. For these loops we must initialize the loop counter from a variable + * whose value the verifier cannot reason about when checking the program, so + * that the loop counter's value is imprecise. + */ +static __u64 zero = 0; + +/* + * XXX Hack to get the verifier to find the arena for sdt_exit_task. + * As of 6.12-rc5, The verifier associates arenas with programs by + * checking LD.IMM instruction operands for an arena and populating + * the program state with the first instance it finds. This requires + * accessing our global arena variable, but scx methods do not necessarily + * do so while still using pointers from that arena. Insert a bpf_printk + * statement that triggers at most once to generate an LD.IMM instruction + * to access the arena and help the verifier. + */ +static volatile bool scx_arena_verify_once; + +__hidden void scx_arena_subprog_init(void) +{ + if (scx_arena_verify_once) + return; + + bpf_printk("%s: arena pointer %p", __func__, &arena); + scx_arena_verify_once = true; +} + + +private(LOCK) struct bpf_spin_lock alloc_lock; +private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock; + +/* allocation pools */ +struct sdt_pool desc_pool; +struct sdt_pool chunk_pool; + +/* Protected by alloc_lock. */ +struct scx_alloc_stats alloc_stats; + + +/* Allocate element from the pool. Must be called with a then pool lock held. */ +static +void __arena *scx_alloc_from_pool(struct sdt_pool *pool) +{ + __u64 elem_size, max_elems; + void __arena *slab; + void __arena *ptr; + + elem_size = pool->elem_size; + max_elems = pool->max_elems; + + /* If the chunk is spent, get a new one. */ + if (pool->idx >= max_elems) { + slab = bpf_arena_alloc_pages(&arena, NULL, + div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0); + if (!slab) + return NULL; + + pool->slab = slab; + pool->idx = 0; + } + + ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx); + pool->idx += 1; + + return ptr; +} + +/* Alloc desc and associated chunk. Called with the allocator spinlock held. */ +static sdt_desc_t *scx_alloc_chunk(void) +{ + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + sdt_desc_t *out; + + chunk = scx_alloc_from_pool(&chunk_pool); + if (!chunk) + return NULL; + + desc = scx_alloc_from_pool(&desc_pool); + if (!desc) { + /* + * Effectively frees the previous chunk allocation. + * Index cannot be 0, so decrementing is always + * valid. + */ + chunk_pool.idx -= 1; + return NULL; + } + + out = desc; + + desc->nr_free = SDT_TASK_ENTS_PER_CHUNK; + desc->chunk = chunk; + + alloc_stats.chunk_allocs += 1; + + return out; +} + +static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages) +{ + if (unlikely(data_size % 8)) + return -EINVAL; + + if (unlikely(nr_pages == 0)) + return -EINVAL; + + pool->elem_size = data_size; + pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size; + /* Populate the pool slab on the first allocation. */ + pool->idx = pool->max_elems; + + return 0; +} + +/* Initialize both the base pool allocators and the root chunk of the index. */ +__hidden int +scx_alloc_init(struct scx_allocator *alloc, __u64 data_size) +{ + size_t min_chunk_size; + int ret; + + _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE, + "chunk size must fit into a page"); + + ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1); + if (ret != 0) + return ret; + + ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1); + if (ret != 0) + return ret; + + /* Wrap data into a descriptor and word align. */ + data_size += sizeof(struct sdt_data); + data_size = round_up(data_size, 8); + + /* + * Ensure we allocate large enough chunks from the arena to avoid excessive + * internal fragmentation when turning chunks it into structs. + */ + min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE); + ret = pool_set_size(&alloc->pool, data_size, min_chunk_size); + if (ret != 0) + return ret; + + bpf_spin_lock(&alloc_lock); + alloc->root = scx_alloc_chunk(); + bpf_spin_unlock(&alloc_lock); + if (!alloc->root) + return -ENOMEM; + + return 0; +} + +static +int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state) +{ + __u64 __arena *allocated = desc->allocated; + __u64 bit; + + if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK)) + return -EINVAL; + + bit = (__u64)1 << (pos % 64); + + if (state) + allocated[pos / 64] |= bit; + else + allocated[pos / 64] &= ~bit; + + return 0; +} + +static __noinline +int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS]) +{ + sdt_desc_t *desc; + __u64 u, level; + int ret; + + for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { + level = SDT_TASK_LEVELS - 1 - u; + + /* Only propagate upwards if we are the parent's only free chunk. */ + desc = lv_desc[level]; + + ret = set_idx_state(desc, lv_pos[level], false); + if (unlikely(ret != 0)) + return ret; + + desc->nr_free += 1; + if (desc->nr_free > 1) + return 0; + } + + return 0; +} + +/* + * Free the allocated struct with the given index. Called with the + * allocator lock taken. + */ +__hidden +int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx) +{ + const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1; + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; + sdt_desc_t * __arena *desc_children; + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + struct sdt_data __arena *data; + __u64 level, shift, pos; + __u64 lv_pos[SDT_TASK_LEVELS]; + int ret; + int i; + + if (!alloc) + return 0; + + desc = alloc->root; + if (unlikely(!desc)) + return -EINVAL; + + /* To appease the verifier. */ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + lv_desc[level] = NULL; + lv_pos[level] = 0; + } + + /* Find the leaf node containing the index. */ + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT; + pos = (idx >> shift) & mask; + + lv_desc[level] = desc; + lv_pos[level] = pos; + + if (level == SDT_TASK_LEVELS - 1) + break; + + chunk = desc->chunk; + + desc_children = (sdt_desc_t * __arena *)chunk->descs; + desc = desc_children[pos]; + + if (unlikely(!desc)) + return -EINVAL; + } + + chunk = desc->chunk; + + pos = idx & mask; + data = chunk->data[pos]; + if (likely(data)) { + *data = (struct sdt_data) { + .tid.genn = data->tid.genn + 1, + }; + + /* Zero out one word at a time. */ + for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) { + data->payload[i] = 0; + } + } + + ret = mark_nodes_avail(lv_desc, lv_pos); + if (unlikely(ret != 0)) + return ret; + + alloc_stats.active_allocs -= 1; + alloc_stats.free_ops += 1; + + return 0; +} + +static inline +int ffs(__u64 word) +{ + unsigned int num = 0; + + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } + + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + + if ((word & 0xf) == 0) { + num += 4; + word >>= 4; + } + + if ((word & 0x3) == 0) { + num += 2; + word >>= 2; + } + + if ((word & 0x1) == 0) { + num += 1; + word >>= 1; + } + + return num; +} + + +/* find the first empty slot */ +__hidden +__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc) +{ + __u64 freeslots; + __u64 i; + + for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) { + freeslots = ~desc->allocated[i]; + if (freeslots == (__u64)0) + continue; + + return (i * 64) + ffs(freeslots); + } + + return SDT_TASK_ENTS_PER_CHUNK; +} + +/* + * Find and return an available idx on the allocator. + * Called with the task spinlock held. + */ +static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp) +{ + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; + sdt_desc_t * __arena *desc_children; + struct sdt_chunk __arena *chunk; + sdt_desc_t *tmp; + __u64 lv_pos[SDT_TASK_LEVELS]; + __u64 u, pos, level; + __u64 idx = 0; + int ret; + + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { + pos = chunk_find_empty(desc); + + /* If we error out, something has gone very wrong. */ + if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK)) + return NULL; + + if (pos == SDT_TASK_ENTS_PER_CHUNK) + return NULL; + + idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT; + idx |= pos; + + /* Log the levels to complete allocation. */ + lv_desc[level] = desc; + lv_pos[level] = pos; + + /* The rest of the loop is for internal node traversal. */ + if (level == SDT_TASK_LEVELS - 1) + break; + + /* Allocate an internal node if necessary. */ + chunk = desc->chunk; + desc_children = (sdt_desc_t * __arena *)chunk->descs; + + desc = desc_children[pos]; + if (!desc) { + desc = scx_alloc_chunk(); + if (!desc) + return NULL; + + desc_children[pos] = desc; + } + } + + /* + * Finding the descriptor along with any internal node + * allocations was successful. Update all levels with + * the new allocation. + */ + bpf_for(u, 0, SDT_TASK_LEVELS) { + level = SDT_TASK_LEVELS - 1 - u; + tmp = lv_desc[level]; + + ret = set_idx_state(tmp, lv_pos[level], true); + if (ret != 0) + break; + + tmp->nr_free -= 1; + if (tmp->nr_free > 0) + break; + } + + *idxp = idx; + + return desc; +} + +__hidden +void __arena *scx_alloc(struct scx_allocator *alloc) +{ + struct sdt_data __arena *data = NULL; + struct sdt_chunk __arena *chunk; + sdt_desc_t *desc; + __u64 idx, pos; + + if (!alloc) + return NULL; + + bpf_spin_lock(&alloc_lock); + + /* We unlock if we encounter an error in the function. */ + desc = desc_find_empty(alloc->root, &idx); + if (unlikely(desc == NULL)) { + bpf_spin_unlock(&alloc_lock); + return NULL; + } + + chunk = desc->chunk; + + /* Populate the leaf node if necessary. */ + pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1); + data = chunk->data[pos]; + if (!data) { + data = scx_alloc_from_pool(&alloc->pool); + if (!data) { + scx_alloc_free_idx(alloc, idx); + bpf_spin_unlock(&alloc_lock); + return NULL; + } + } + + chunk->data[pos] = data; + + /* The data counts as a chunk */ + alloc_stats.data_allocs += 1; + alloc_stats.alloc_ops += 1; + alloc_stats.active_allocs += 1; + + data->tid.idx = idx; + + bpf_spin_unlock(&alloc_lock); + + return data; +} + +/* + * Task BPF map entry recording the task's assigned ID and pointing to the data + * area allocated in arena. + */ +struct scx_task_map_val { + union sdt_id tid; + __u64 tptr; + struct sdt_data __arena *data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct scx_task_map_val); +} scx_task_map SEC(".maps"); + +static struct scx_allocator scx_task_allocator; + +__hidden +void __arena *scx_task_alloc(struct task_struct *p) +{ + struct sdt_data __arena *data = NULL; + struct scx_task_map_val *mval; + + mval = bpf_task_storage_get(&scx_task_map, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!mval) + return NULL; + + data = scx_alloc(&scx_task_allocator); + if (unlikely(!data)) + return NULL; + + mval->tid = data->tid; + mval->tptr = (__u64) p; + mval->data = data; + + return (void __arena *)data->payload; +} + +__hidden +int scx_task_init(__u64 data_size) +{ + return scx_alloc_init(&scx_task_allocator, data_size); +} + +__hidden +void __arena *scx_task_data(struct task_struct *p) +{ + struct sdt_data __arena *data; + struct scx_task_map_val *mval; + + scx_arena_subprog_init(); + + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); + if (!mval) + return NULL; + + data = mval->data; + + return (void __arena *)data->payload; +} + +__hidden +void scx_task_free(struct task_struct *p) +{ + struct scx_task_map_val *mval; + + scx_arena_subprog_init(); + + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); + if (!mval) + return; + + bpf_spin_lock(&alloc_lock); + scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx); + bpf_spin_unlock(&alloc_lock); + + bpf_task_storage_delete(&scx_task_map, p); +} + +static inline void +scx_stat_global_update(struct scx_stats __arena *stats) +{ + cast_kern(stats); + __sync_fetch_and_add(&stat_enqueue, stats->enqueue); + __sync_fetch_and_add(&stat_init, stats->init); + __sync_fetch_and_add(&stat_exit, stats->exit); + __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu); + __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu); +} + +s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct scx_stats __arena *stats; + bool is_idle = false; + s32 cpu; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return 0; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc_select_idle_cpu(stats); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } else { + stat_inc_select_busy_cpu(stats); + } + + return cpu; +} + +void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct scx_stats __arena *stats; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return; + } + + stat_inc_enqueue(stats); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_dsq_move_to_local(SHARED_DSQ); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct scx_stats __arena *stats; + + stats = scx_task_alloc(p); + if (!stats) { + scx_bpf_error("arena allocator out of memory"); + return -ENOMEM; + } + + stats->pid = p->pid; + + stat_inc_init(stats); + + return 0; +} + +void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + struct scx_stats __arena *stats; + + stats = scx_task_data(p); + if (!stats) { + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); + return; + } + + stat_inc_exit(stats); + scx_stat_global_update(stats); + + scx_task_free(p); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init) +{ + int ret; + + ret = scx_task_init(sizeof(struct scx_stats)); + if (ret < 0) { + scx_bpf_error("%s: failed with %d", __func__, ret); + return ret; + } + + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); + return ret; + } + + return 0; +} + +void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(sdt_ops, + .select_cpu = (void *)sdt_select_cpu, + .enqueue = (void *)sdt_enqueue, + .dispatch = (void *)sdt_dispatch, + .init_task = (void *)sdt_init_task, + .exit_task = (void *)sdt_exit_task, + .init = (void *)sdt_init, + .exit = (void *)sdt_exit, + .name = "sdt"); diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c new file mode 100644 index 000000000000..b0363363476d --- /dev/null +++ b/tools/sched_ext/scx_sdt.c @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com> + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <libgen.h> +#include <bpf/bpf.h> +#include <scx/common.h> + +#include "scx_sdt.h" +#include "scx_sdt.bpf.skel.h" + +const char help_fmt[] = +"A simple arena-based sched_ext scheduler.\n" +"\n" +"Modified version of scx_simple that demonstrates arena-based data structures.\n" +"\n" +"Usage: %s [-f] [-v]\n" +"\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +static bool verbose; +static volatile int exit_req; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int sig) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_sdt *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(sdt_ops, scx_sdt); + + while ((opt = getopt(argc, argv, "fvh")) != -1) { + switch (opt) { + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei); + link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + printf("====SCHEDULING STATS====\n"); + printf("enqueues=%llu\t", skel->bss->stat_enqueue); + printf("inits=%llu\t", skel->bss->stat_init); + printf("exits=%llu\t", skel->bss->stat_exit); + printf("\n"); + + printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu); + printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu); + printf("\n"); + + printf("====ALLOCATION STATS====\n"); + printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs); + printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs); + printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops); + printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops); + printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs); + printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used); + printf("\n\n"); + + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_sdt__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_sdt.h b/tools/sched_ext/scx_sdt.h new file mode 100644 index 000000000000..67982ce9bc9b --- /dev/null +++ b/tools/sched_ext/scx_sdt.h @@ -0,0 +1,113 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + * Copyright (c) 2025 Emil Tsalapatis <etsal@meta.com> + */ +#pragma once + +#ifndef __BPF__ +#define __arena +#endif /* __BPF__ */ + +struct scx_alloc_stats { + __u64 chunk_allocs; + __u64 data_allocs; + __u64 alloc_ops; + __u64 free_ops; + __u64 active_allocs; + __u64 arena_pages_used; +}; + +struct sdt_pool { + void __arena *slab; + __u64 elem_size; + __u64 max_elems; + __u64 idx; +}; + +#ifndef div_round_up +#define div_round_up(a, b) (((a) + (b) - 1) / (b)) +#endif + +#ifndef round_up +#define round_up(a, b) (div_round_up((a), (b)) * (b)) +#endif + +typedef struct sdt_desc __arena sdt_desc_t; + +enum sdt_consts { + SDT_TASK_ENTS_PER_PAGE_SHIFT = 9, + SDT_TASK_LEVELS = 3, + SDT_TASK_ENTS_PER_CHUNK = 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT, + SDT_TASK_CHUNK_BITMAP_U64S = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64), + SDT_TASK_MIN_ELEM_PER_ALLOC = 8, +}; + +union sdt_id { + __s64 val; + struct { + __s32 idx; /* index in the radix tree */ + __s32 genn; /* ++'d on recycle so that it forms unique'ish 64bit ID */ + }; +}; + +struct sdt_chunk; + +/* + * Each index page is described by the following descriptor which carries the + * bitmap. This way the actual index can host power-of-two numbers of entries + * which makes indexing cheaper. + */ +struct sdt_desc { + __u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S]; + __u64 nr_free; + struct sdt_chunk __arena *chunk; +}; + +/* + * Leaf node containing per-task data. + */ +struct sdt_data { + union sdt_id tid; + __u64 payload[]; +}; + +/* + * Intermediate node pointing to another intermediate node or leaf node. + */ +struct sdt_chunk { + union { + sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK]; + struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK]; + }; +}; + +struct scx_allocator { + struct sdt_pool pool; + sdt_desc_t *root; +}; + +struct scx_stats { + int seq; + pid_t pid; + __u64 enqueue; + __u64 exit; + __u64 init; + __u64 select_busy_cpu; + __u64 select_idle_cpu; +}; + +#ifdef __BPF__ + +void __arena *scx_task_data(struct task_struct *p); +int scx_task_init(__u64 data_size); +void __arena *scx_task_alloc(struct task_struct *p); +void scx_task_free(struct task_struct *p); +void scx_arena_subprog_init(void); + +int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size); +u64 scx_alloc_internal(struct scx_allocator *alloc); +int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx); + +#endif /* __BPF__ */ diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index e6de99dba7db..b456bd7cae77 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -131,7 +131,15 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) { - return scx_bpf_create_dsq(SHARED_DSQ, -1); + int ret; + + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) { + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); + return ret; + } + + return 0; } void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c new file mode 100644 index 000000000000..f29862b89386 --- /dev/null +++ b/tools/sched_ext/scx_userland.bpf.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. + * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <scx/common.bpf.h> +#include "scx_userland.h" + +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + +char _license[] SEC("license") = "GPL"; + +const volatile s32 usersched_pid; + +/* !0 for veristat, set during init */ +const volatile u32 num_possible_cpus = 64; + +/* Stats that are printed by user space. */ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +/* + * Number of tasks that are queued for scheduling. + * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; + +/* + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. + */ +volatile u64 nr_scheduled; + +UEI_DEFINE(uei); + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). + */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + p = usersched_task(); + if (p) { + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + struct scx_userland_enqueued_task task = {}; + + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. + */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + set_usersched_needed(); + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} + +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + if (test_and_clear_usersched_needed()) + dispatch_user_scheduler(); + + bpf_repeat(MAX_ENQUEUED_TASKS) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. + */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +/* + * A CPU is about to change its idle state. If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from and idle state, a CPU owner will + * be assigned in .running(). + */ + if (!idle) + return; + /* + * A CPU is now available, notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). + * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows to determine if there is still some + * pending work to do for the scheduler: new tasks have been queued + * since last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake-up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. + */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(userland_ops, + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | + SCX_OPS_KEEP_BUILTIN_IDLE, + .name = "userland"); diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c new file mode 100644 index 000000000000..10b31020f44f --- /dev/null +++ b/tools/sched_ext/scx_userland.c @@ -0,0 +1,437 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <stdio.h> +#include <unistd.h> +#include <sched.h> +#include <signal.h> +#include <assert.h> +#include <libgen.h> +#include <pthread.h> +#include <bpf/bpf.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/syscall.h> + +#include <scx/common.h> +#include "scx_userland.h" +#include "scx_userland.bpf.skel.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" +"Usage: %s [-b BATCH]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -v Print libbpf debug messages\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static bool verbose; +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_userland *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. + */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. + */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. + */ +struct enqueued_task *tasks; +static int pid_max; + +static double min_vruntime; + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !verbose) + return 0; + return vfprintf(stderr, format, args); +} + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(__s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + nr_vruntime_failed++; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= pid_max) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + nr_curr_enqueued++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. + */ + + if (LIST_EMPTY(&vruntime_head)) { + LIST_INSERT_HEAD(&vruntime_head, curr, entries); + return 0; + } + + LIST_FOREACH(enqueued, &vruntime_head, entries) { + if (curr->vruntime <= enqueued->vruntime) { + LIST_INSERT_BEFORE(enqueued, curr, entries); + return 0; + } + prev = enqueued; + } + + LIST_INSERT_AFTER(prev, curr, entries); + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + struct scx_userland_enqueued_task task; + int err; + + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; + return; + } + + err = vruntime_enqueue(&task); + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + } +} + +static void dispatch_batch(void) +{ + __u32 i; + + for (i = 0; i < batch_size; i++) { + struct enqueued_task *task; + int err; + __s32 pid; + + task = LIST_FIRST(&vruntime_head); + if (!task) + break; + + min_vruntime = task->vruntime; + pid = task_pid(task); + LIST_REMOVE(task, entries); + err = dispatch_task(pid); + if (err) { + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. + */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; + } + nr_curr_enqueued--; + } + skel->bss->nr_scheduled = nr_curr_enqueued; +} + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + nr_failed_enqueues = skel->bss->nr_failed_enqueues; + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + nr_user_enqueues = skel->bss->nr_user_enqueues; + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + printf("o-----------------------o\n"); + printf("| BPF ENQUEUES |\n"); + printf("|-----------------------|\n"); + printf("| kern: %10llu |\n", nr_kernel_enqueues); + printf("| user: %10llu |\n", nr_user_enqueues); + printf("| failed: %10llu |\n", nr_failed_enqueues); + printf("| -------------------- |\n"); + printf("| total: %10llu |\n", total); + printf("| |\n"); + printf("|-----------------------|\n"); + printf("| VRUNTIME / USER |\n"); + printf("|-----------------------|\n"); + printf("| enq: %10llu |\n", nr_vruntime_enqueues); + printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); + printf("o-----------------------o\n"); + printf("\n\n"); + fflush(stdout); + sleep(1); + } + + return NULL; +} + +static int spawn_stats_thread(void) +{ + pthread_t stats_printer; + + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); +} + +static void pre_bootstrap(int argc, char **argv) +{ + int err; + __u32 opt; + struct sched_param sched_param = { + .sched_priority = sched_get_priority_max(SCHED_EXT), + }; + + err = init_tasks(); + if (err) + exit(err); + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + /* + * Enforce that the user scheduler task is managed by sched_ext. The + * task eagerly drains the list of enqueued tasks in its main work + * loop, and then yields the CPU. The BPF scheduler only schedules the + * user space scheduler task when at least one other task in the system + * needs to be scheduled. + */ + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); + SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); + + while ((opt = getopt(argc, argv, "b:vh")) != -1) { + switch (opt) { + case 'b': + batch_size = strtoul(optarg, NULL, 0); + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + exit(opt != 'h'); + } + } + + /* + * It's not always safe to allocate in a user space scheduler, as an + * enqueued task could hold a lock that we require in order to be able + * to allocate. + */ + err = mlockall(MCL_CURRENT | MCL_FUTURE); + SCX_BUG_ON(err, "Failed to prefault and lock address space"); +} + +static void bootstrap(char *comm) +{ + skel = SCX_OPS_OPEN(userland_ops, scx_userland); + + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->num_possible_cpus > 0); + skel->rodata->usersched_pid = getpid(); + assert(skel->rodata->usersched_pid > 0); + + SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei); + + enqueued_fd = bpf_map__fd(skel->maps.enqueued); + dispatched_fd = bpf_map__fd(skel->maps.dispatched); + assert(enqueued_fd > 0); + assert(dispatched_fd > 0); + + SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); + + ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland); +} + +static void sched_main_loop(void) +{ + while (!exit_req) { + /* + * Perform the following work in the main user space scheduler + * loop: + * + * 1. Drain all tasks from the enqueued map, and enqueue them + * to the vruntime sorted list. + * + * 2. Dispatch a batch of tasks from the vruntime sorted list + * down to the kernel. + * + * 3. Yield the CPU back to the system. The BPF scheduler will + * reschedule the user space scheduler once another task has + * been enqueued to user space. + */ + drain_enqueued_map(); + dispatch_batch(); + sched_yield(); + } +} + +int main(int argc, char **argv) +{ + __u64 ecode; + + pre_bootstrap(argc, argv); +restart: + bootstrap(argv[0]); + sched_main_loop(); + + exit_req = 1; + bpf_link__destroy(ops_link); + ecode = UEI_REPORT(skel, uei); + scx_userland__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h new file mode 100644 index 000000000000..684fb2dd5de9 --- /dev/null +++ b/tools/sched_ext/scx_userland.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta, Inc */ + +#ifndef __SCX_USERLAND_COMMON_H +#define __SCX_USERLAND_COMMON_H + +/* + * An instance of a task that has been enqueued by the kernel for consumption + * by a user space global scheduler thread. + */ +struct scx_userland_enqueued_task { + __s32 pid; + u64 sum_exec_runtime; + u64 weight; +}; + +#endif // __SCX_USERLAND_COMMON_H diff --git a/tools/spi/.gitignore b/tools/spi/.gitignore index 14ddba3d2195..038261b34ed8 100644 --- a/tools/spi/.gitignore +++ b/tools/spi/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only spidev_fdx spidev_test +include/ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0e151d0572d1..53d84a6874b7 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -7,9 +7,10 @@ ldflags-y += --wrap=nvdimm_bus_register ldflags-y += --wrap=cxl_await_media_ready ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_endpoint_parse_cdat -ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup ldflags-y += --wrap=hmat_get_extended_linear_cache_size +ldflags-y += --wrap=devm_cxl_add_dport_by_dev +ldflags-y += --wrap=devm_cxl_switch_port_decoders_setup DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl @@ -57,12 +58,14 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o -cxl_core-y += $(CXL_CORE_SRC)/ras.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras_rch.o +cxl_core-$(CONFIG_CXL_ATL) += $(CXL_CORE_SRC)/atl.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o diff --git a/tools/testing/cxl/cxl_core_exports.c b/tools/testing/cxl/cxl_core_exports.c index 6754de35598d..f088792a8925 100644 --- a/tools/testing/cxl/cxl_core_exports.c +++ b/tools/testing/cxl/cxl_core_exports.c @@ -2,28 +2,6 @@ /* Copyright(c) 2022 Intel Corporation. All rights reserved. */ #include "cxl.h" -#include "exports.h" /* Exporting of cxl_core symbols that are only used by cxl_test */ EXPORT_SYMBOL_NS_GPL(cxl_num_decoders_committed, "CXL"); - -cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev = - __devm_cxl_add_dport_by_dev; -EXPORT_SYMBOL_NS_GPL(_devm_cxl_add_dport_by_dev, "CXL"); - -struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev) -{ - return _devm_cxl_add_dport_by_dev(port, dport_dev); -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL"); - -cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup = - __devm_cxl_switch_port_decoders_setup; -EXPORT_SYMBOL_NS_GPL(_devm_cxl_switch_port_decoders_setup, "CXL"); - -int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) -{ - return _devm_cxl_switch_port_decoders_setup(port); -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL"); diff --git a/tools/testing/cxl/exports.h b/tools/testing/cxl/exports.h deleted file mode 100644 index 7ebee7c0bd67..000000000000 --- a/tools/testing/cxl/exports.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 2025 Intel Corporation */ -#ifndef __MOCK_CXL_EXPORTS_H_ -#define __MOCK_CXL_EXPORTS_H_ - -typedef struct cxl_dport *(*cxl_add_dport_by_dev_fn)(struct cxl_port *port, - struct device *dport_dev); -extern cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev; - -typedef int(*cxl_switch_decoders_setup_fn)(struct cxl_port *port); -extern cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup; - -#endif diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 176dcde570cd..cb87e8c0e63c 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 44bce80ef3ff..b8fcb50c1027 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -10,21 +10,12 @@ #include <cxlmem.h> #include <cxlpci.h> #include "mock.h" -#include "../exports.h" static LIST_HEAD(mock); -static struct cxl_dport * -redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev); -static int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port); - void register_cxl_mock_ops(struct cxl_mock_ops *ops) { list_add_rcu(&ops->list, &mock); - _devm_cxl_add_dport_by_dev = redirect_devm_cxl_add_dport_by_dev; - _devm_cxl_switch_port_decoders_setup = - redirect_devm_cxl_switch_port_decoders_setup; } EXPORT_SYMBOL_GPL(register_cxl_mock_ops); @@ -32,9 +23,6 @@ DEFINE_STATIC_SRCU(cxl_mock_srcu); void unregister_cxl_mock_ops(struct cxl_mock_ops *ops) { - _devm_cxl_switch_port_decoders_setup = - __devm_cxl_switch_port_decoders_setup; - _devm_cxl_add_dport_by_dev = __devm_cxl_add_dport_by_dev; list_del_rcu(&ops->list); synchronize_srcu(&cxl_mock_srcu); } @@ -163,7 +151,7 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); -int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +int __wrap_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) { int rc, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); @@ -171,11 +159,12 @@ int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) if (ops && ops->is_mock_port(port->uport_dev)) rc = ops->devm_cxl_switch_port_decoders_setup(port); else - rc = __devm_cxl_switch_port_decoders_setup(port); + rc = devm_cxl_switch_port_decoders_setup(port); put_cxl_mock_ops(index); return rc; } +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_switch_port_decoders_setup, "CXL"); int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) { @@ -245,20 +234,8 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, "CXL"); -void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) -{ - int index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (!ops || !ops->is_mock_port(dport->dport_dev)) - cxl_dport_init_ras_reporting(dport, host); - - put_cxl_mock_ops(index); -} -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL"); - -struct cxl_dport *redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev) +struct cxl_dport *__wrap_devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) { int index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); @@ -267,11 +244,12 @@ struct cxl_dport *redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, if (ops && ops->is_mock_port(port->uport_dev)) dport = ops->devm_cxl_add_dport_by_dev(port, dport_dev); else - dport = __devm_cxl_add_dport_by_dev(port, dport_dev); + dport = devm_cxl_add_dport_by_dev(port, dport_dev); put_cxl_mock_ops(index); return dport; } +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_dport_by_dev, "CXL"); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("cxl_test: emulation module"); diff --git a/tools/testing/kunit/kunit-completion.sh b/tools/testing/kunit/kunit-completion.sh new file mode 100644 index 000000000000..f053e7b5d265 --- /dev/null +++ b/tools/testing/kunit/kunit-completion.sh @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +# bash completion support for KUnit + +_kunit_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +_kunit() +{ + local cur prev words cword + _init_completion || return + + local script="${_kunit_dir}/kunit.py" + + if [[ $cword -eq 1 && "$cur" != -* ]]; then + local cmds=$(${script} --list-cmds 2>/dev/null) + COMPREPLY=($(compgen -W "${cmds}" -- "$cur")) + return 0 + fi + + if [[ "$cur" == -* ]]; then + if [[ -n "${words[1]}" && "${words[1]}" != -* ]]; then + local opts=$(${script} ${words[1]} --list-opts 2>/dev/null) + COMPREPLY=($(compgen -W "${opts}" -- "$cur")) + return 0 + else + local opts=$(${script} --list-opts 2>/dev/null) + COMPREPLY=($(compgen -W "${opts}" -- "$cur")) + return 0 + fi + fi +} + +complete -o default -F _kunit kunit.py +complete -o default -F _kunit kunit +complete -o default -F _kunit ./tools/testing/kunit/kunit.py diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index cd99c1956331..4ec5ecba6d49 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -323,11 +323,27 @@ def get_default_jobs() -> int: return ncpu raise RuntimeError("os.cpu_count() returned None") +def get_default_build_dir() -> str: + if 'KBUILD_OUTPUT' in os.environ: + return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit') + return '.kunit' + +def add_completion_opts(parser: argparse.ArgumentParser) -> None: + parser.add_argument('--list-opts', + help=argparse.SUPPRESS, + action='store_true') + +def add_root_opts(parser: argparse.ArgumentParser) -> None: + parser.add_argument('--list-cmds', + help=argparse.SUPPRESS, + action='store_true') + add_completion_opts(parser) + def add_common_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' 'directory.', - type=str, default='.kunit', metavar='DIR') + type=str, default=get_default_build_dir(), metavar='DIR') parser.add_argument('--make_options', help='X=Y make option, can be repeated.', action='append', metavar='X=Y') @@ -374,6 +390,8 @@ def add_common_opts(parser: argparse.ArgumentParser) -> None: help='Additional QEMU arguments, e.g. "-smp 8"', action='append', metavar='') + add_completion_opts(parser) + def add_build_opts(parser: argparse.ArgumentParser) -> None: parser.add_argument('--jobs', help='As in the make command, "Specifies the number of ' @@ -569,6 +587,7 @@ subcommand_handlers_map = { def main(argv: Sequence[str]) -> None: parser = argparse.ArgumentParser( description='Helps writing and running KUnit tests.') + add_root_opts(parser) subparser = parser.add_subparsers(dest='subcommand') # The 'run' command will config, build, exec, and parse in one go. @@ -603,12 +622,28 @@ def main(argv: Sequence[str]) -> None: parse_parser.add_argument('file', help='Specifies the file to read results from.', type=str, nargs='?', metavar='input_file') + add_completion_opts(parse_parser) cli_args = parser.parse_args(massage_argv(argv)) if get_kernel_root_path(): os.chdir(get_kernel_root_path()) + if cli_args.list_cmds: + print(" ".join(subparser.choices.keys())) + return + + if cli_args.list_opts: + target_parser = subparser.choices.get(cli_args.subcommand) + if not target_parser: + target_parser = parser + + # Accessing private attribute _option_string_actions to get + # the list of options. This is not a public API, but argparse + # does not provide a way to inspect options programmatically. + print(' '.join(target_parser._option_string_actions.keys())) + return + subcomand_handler = subcommand_handlers_map.get(cli_args.subcommand, None) if subcomand_handler is None: diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index 333cd3a4a56b..5338489dcbe4 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -689,6 +689,9 @@ def bubble_up_test_results(test: Test) -> None: elif test.counts.get_status() == TestStatus.TEST_CRASHED: test.status = TestStatus.TEST_CRASHED + if status == TestStatus.FAILURE and test.counts.get_status() == TestStatus.SUCCESS: + counts.add_status(status) + def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: bool, printer: Printer) -> Test: """ Finds next test to parse in LineStream, creates new Test object, diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index bbba921e0eac..b67408147c1f 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -11,11 +11,13 @@ from unittest import mock import tempfile, shutil # Handling test_tmpdir +import io import itertools import json import os import signal import subprocess +import sys from typing import Iterable import kunit_config @@ -36,7 +38,7 @@ def setUpModule(): def tearDownModule(): shutil.rmtree(test_tmpdir) -def test_data_path(path): +def _test_data_path(path): return os.path.join(abs_test_data_dir, path) class KconfigTest(unittest.TestCase): @@ -52,7 +54,7 @@ class KconfigTest(unittest.TestCase): self.assertFalse(kconfig1.is_subset_of(kconfig0)) def test_read_from_file(self): - kconfig_path = test_data_path('test_read_from_file.kconfig') + kconfig_path = _test_data_path('test_read_from_file.kconfig') kconfig = kunit_config.parse_file(kconfig_path) @@ -98,7 +100,7 @@ class KUnitParserTest(unittest.TestCase): raise AssertionError(f'"{needle}" not found in {list(backup)}!') def test_output_isolated_correctly(self): - log_path = test_data_path('test_output_isolated_correctly.log') + log_path = _test_data_path('test_output_isolated_correctly.log') with open(log_path) as file: result = kunit_parser.extract_tap_lines(file.readlines()) self.assertContains('TAP version 14', result) @@ -109,7 +111,7 @@ class KUnitParserTest(unittest.TestCase): self.assertContains('ok 1 - example', result) def test_output_with_prefix_isolated_correctly(self): - log_path = test_data_path('test_pound_sign.log') + log_path = _test_data_path('test_pound_sign.log') with open(log_path) as file: result = kunit_parser.extract_tap_lines(file.readlines()) self.assertContains('TAP version 14', result) @@ -138,35 +140,46 @@ class KUnitParserTest(unittest.TestCase): self.assertContains('ok 3 - string-stream-test', result) def test_parse_successful_test_log(self): - all_passed_log = test_data_path('test_is_test_passed-all_passed.log') + all_passed_log = _test_data_path('test_is_test_passed-all_passed.log') with open(all_passed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_successful_nested_tests_log(self): - all_passed_log = test_data_path('test_is_test_passed-all_passed_nested.log') + all_passed_log = _test_data_path('test_is_test_passed-all_passed_nested.log') with open(all_passed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_kselftest_nested(self): - kselftest_log = test_data_path('test_is_test_passed-kselftest.log') + kselftest_log = _test_data_path('test_is_test_passed-kselftest.log') with open(kselftest_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts.errors, 0) def test_parse_failed_test_log(self): - failed_log = test_data_path('test_is_test_passed-failure.log') + failed_log = _test_data_path('test_is_test_passed-failure.log') with open(failed_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) self.assertEqual(result.counts.errors, 0) + def test_parse_failed_nested_tests_log(self): + nested_log = _test_data_path('test_is_test_passed-failure-nested.log') + with open(nested_log) as file: + result = kunit_parser.parse_run_tests(file.readlines(), stdout) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status) + self.assertEqual(result.counts.failed, 2) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status) + self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.subtests[0].subtests[0].status) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status) + self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status) + def test_no_header(self): - empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log') + empty_log = _test_data_path('test_is_test_passed-no_tests_run_no_header.log') with open(empty_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -175,7 +188,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_missing_test_plan(self): - missing_plan_log = test_data_path('test_is_test_passed-' + missing_plan_log = _test_data_path('test_is_test_passed-' 'missing_plan.log') with open(missing_plan_log) as file: result = kunit_parser.parse_run_tests( @@ -186,7 +199,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) def test_no_tests(self): - header_log = test_data_path('test_is_test_passed-no_tests_run_with_header.log') + header_log = _test_data_path('test_is_test_passed-no_tests_run_with_header.log') with open(header_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -195,7 +208,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_no_tests_no_plan(self): - no_plan_log = test_data_path('test_is_test_passed-no_tests_no_plan.log') + no_plan_log = _test_data_path('test_is_test_passed-no_tests_no_plan.log') with open(no_plan_log) as file: result = kunit_parser.parse_run_tests( kunit_parser.extract_tap_lines(file.readlines()), stdout) @@ -207,7 +220,7 @@ class KUnitParserTest(unittest.TestCase): def test_no_kunit_output(self): - crash_log = test_data_path('test_insufficient_memory.log') + crash_log = _test_data_path('test_insufficient_memory.log') print_mock = mock.patch('kunit_printer.Printer.print').start() with open(crash_log) as file: result = kunit_parser.parse_run_tests( @@ -218,7 +231,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 1) def test_skipped_test(self): - skipped_log = test_data_path('test_skip_tests.log') + skipped_log = _test_data_path('test_skip_tests.log') with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -227,7 +240,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts, kunit_parser.TestCounts(passed=4, skipped=1)) def test_skipped_all_tests(self): - skipped_log = test_data_path('test_skip_all_tests.log') + skipped_log = _test_data_path('test_skip_all_tests.log') with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -235,7 +248,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts, kunit_parser.TestCounts(skipped=5)) def test_ignores_hyphen(self): - hyphen_log = test_data_path('test_strip_hyphen.log') + hyphen_log = _test_data_path('test_strip_hyphen.log') with open(hyphen_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -249,7 +262,7 @@ class KUnitParserTest(unittest.TestCase): result.subtests[1].name) def test_ignores_prefix_printk_time(self): - prefix_log = test_data_path('test_config_printk_time.log') + prefix_log = _test_data_path('test_config_printk_time.log') with open(prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -257,7 +270,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_ignores_multiple_prefixes(self): - prefix_log = test_data_path('test_multiple_prefixes.log') + prefix_log = _test_data_path('test_multiple_prefixes.log') with open(prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -265,7 +278,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_prefix_mixed_kernel_output(self): - mixed_prefix_log = test_data_path('test_interrupted_tap_output.log') + mixed_prefix_log = _test_data_path('test_interrupted_tap_output.log') with open(mixed_prefix_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -273,7 +286,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_prefix_poundsign(self): - pound_log = test_data_path('test_pound_sign.log') + pound_log = _test_data_path('test_pound_sign.log') with open(pound_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -281,7 +294,7 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual(result.counts.errors, 0) def test_kernel_panic_end(self): - panic_log = test_data_path('test_kernel_panic_interrupt.log') + panic_log = _test_data_path('test_kernel_panic_interrupt.log') with open(panic_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.TEST_CRASHED, result.status) @@ -289,7 +302,7 @@ class KUnitParserTest(unittest.TestCase): self.assertGreaterEqual(result.counts.errors, 1) def test_pound_no_prefix(self): - pound_log = test_data_path('test_pound_no_prefix.log') + pound_log = _test_data_path('test_pound_no_prefix.log') with open(pound_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) @@ -318,7 +331,7 @@ class KUnitParserTest(unittest.TestCase): 'Failures: all_failed_suite, some_failed_suite.test2') def test_ktap_format(self): - ktap_log = test_data_path('test_parse_ktap_output.log') + ktap_log = _test_data_path('test_parse_ktap_output.log') with open(ktap_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) self.assertEqual(result.counts, kunit_parser.TestCounts(passed=3)) @@ -327,13 +340,13 @@ class KUnitParserTest(unittest.TestCase): self.assertEqual('case_2', result.subtests[0].subtests[1].name) def test_parse_subtest_header(self): - ktap_log = test_data_path('test_parse_subtest_header.log') + ktap_log = _test_data_path('test_parse_subtest_header.log') with open(ktap_log) as file: kunit_parser.parse_run_tests(file.readlines(), stdout) self.print_mock.assert_any_call(StrContains('suite (1 subtest)')) def test_parse_attributes(self): - ktap_log = test_data_path('test_parse_attributes.log') + ktap_log = _test_data_path('test_parse_attributes.log') with open(ktap_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) @@ -466,7 +479,8 @@ class LinuxSourceTreeTest(unittest.TestCase): want_kconfig = kunit_config.Kconfig() want_kconfig.add_entry('NOT_REAL', 'y') - tree = kunit_kernel.LinuxSourceTree('', kconfig_add=['CONFIG_NOT_REAL=y']) + tree = kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[os.devnull], + kconfig_add=['CONFIG_NOT_REAL=y']) self.assertTrue(want_kconfig.is_subset_of(tree._kconfig), msg=tree._kconfig) def test_invalid_arch(self): @@ -478,7 +492,7 @@ class LinuxSourceTreeTest(unittest.TestCase): return subprocess.Popen(['echo "hi\nbye"'], shell=True, text=True, stdout=subprocess.PIPE) with tempfile.TemporaryDirectory('') as build_dir: - tree = kunit_kernel.LinuxSourceTree(build_dir) + tree = kunit_kernel.LinuxSourceTree(build_dir, kunitconfig_paths=[os.devnull]) mock.patch.object(tree._ops, 'start', side_effect=fake_start).start() with self.assertRaises(ValueError): @@ -555,7 +569,7 @@ class KUnitJsonTest(unittest.TestCase): self.addCleanup(mock.patch.stopall) def _json_for(self, log_file): - with open(test_data_path(log_file)) as file: + with open(_test_data_path(log_file)) as file: test_result = kunit_parser.parse_run_tests(file, stdout) json_obj = kunit_json.get_json_result( test=test_result, @@ -596,11 +610,12 @@ class StrContains(str): class KUnitMainTest(unittest.TestCase): def setUp(self): - path = test_data_path('test_is_test_passed-all_passed.log') + path = _test_data_path('test_is_test_passed-all_passed.log') with open(path) as file: all_passed_log = file.readlines() self.print_mock = mock.patch('kunit_printer.Printer.print').start() + mock.patch.dict(os.environ, clear=True).start() self.addCleanup(mock.patch.stopall) self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start() @@ -723,6 +738,24 @@ class KUnitMainTest(unittest.TestCase): args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) self.print_mock.assert_any_call(StrContains('Testing complete.')) + @mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'}) + def test_run_builddir_from_env(self): + build_dir = '/tmp/.kunit' + kunit.main(['run']) + self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + + @mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'}) + def test_run_builddir_override(self): + build_dir = '.kunit' + kunit.main(['run', '--build_dir=.kunit']) + self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1) + self.linux_source_mock.run_kernel.assert_called_once_with( + args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_config_builddir(self): build_dir = '.kunit' kunit.main(['config', '--build_dir', build_dir]) @@ -855,5 +888,24 @@ class KUnitMainTest(unittest.TestCase): mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test1', filter='', filter_action=None, timeout=300), ]) + @mock.patch.object(sys, 'stdout', new_callable=io.StringIO) + def test_list_cmds(self, mock_stdout): + kunit.main(['--list-cmds']) + output = mock_stdout.getvalue() + output_cmds = sorted(output.split()) + expected_cmds = sorted(['build', 'config', 'exec', 'parse', 'run']) + self.assertEqual(output_cmds, expected_cmds) + + @mock.patch.object(sys, 'stdout', new_callable=io.StringIO) + def test_run_list_opts(self, mock_stdout): + kunit.main(['run', '--list-opts']) + output = mock_stdout.getvalue() + output_cmds = set(output.split()) + self.assertIn('--help', output_cmds) + self.assertIn('--kunitconfig', output_cmds) + self.assertIn('--jobs', output_cmds) + self.assertIn('--kernel_args', output_cmds) + self.assertIn('--raw_output', output_cmds) + if __name__ == '__main__': unittest.main() diff --git a/tools/testing/kunit/qemu_configs/armeb.py b/tools/testing/kunit/qemu_configs/armeb.py new file mode 100644 index 000000000000..86d326651490 --- /dev/null +++ b/tools/testing/kunit/qemu_configs/armeb.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='arm', + kconfig=''' +CONFIG_CPU_BIG_ENDIAN=y +CONFIG_ARCH_VIRT=y +CONFIG_SERIAL_AMBA_PL010=y +CONFIG_SERIAL_AMBA_PL010_CONSOLE=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y''', + qemu_arch='arm', + kernel_path='arch/arm/boot/zImage', + kernel_command_line='console=ttyAMA0', + extra_qemu_params=['-machine', 'virt']) diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log new file mode 100644 index 000000000000..5498dfd0b0db --- /dev/null +++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log @@ -0,0 +1,10 @@ +KTAP version 1 +1..2 + KTAP version 1 + 1..1 + ok 1 test 1 +not ok 1 subtest 1 + KTAP version 1 + 1..1 + not ok 1 subsubtest 1 +not ok 2 subtest 2 diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 56e44a98d6a5..450f13ba4cca 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -22,6 +22,7 @@ TARGETS += drivers/ntsync TARGETS += drivers/s390x/uvdevice TARGETS += drivers/net TARGETS += drivers/net/bonding +TARGETS += drivers/net/netconsole TARGETS += drivers/net/team TARGETS += drivers/net/virtio_net TARGETS += drivers/platform/x86/intel/ifs diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index c4c72ee2ef55..e456f3b62fa1 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -30,13 +30,15 @@ all: @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir -p $$BUILD_TARGET; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \ + $(if $(FORCE_TARGETS),|| exit); \ done install: all @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ - make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \ + $(if $(FORCE_TARGETS),|| exit); \ done run_tests: all diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index c41640f18e4e..9d2df1f3e6bb 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -11,6 +11,8 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <linux/auxvec.h> +#include <linux/compiler.h> #include <sys/auxv.h> #include <sys/prctl.h> #include <asm/hwcap.h> @@ -595,6 +597,45 @@ static void lrcpc3_sigill(void) : "=r" (data0), "=r" (data1) : "r" (src) :); } +static void ignore_signal(int sig, siginfo_t *info, void *context) +{ + ucontext_t *uc = context; + + uc->uc_mcontext.pc += 4; +} + +static void ls64_sigill(void) +{ + struct sigaction ign, old; + char src[64] __aligned(64) = { 1 }; + + /* + * LS64 requires target memory to be Device/Non-cacheable (if + * FEAT_LS64WB not supported) and the completer supports these + * instructions, otherwise we'll receive a SIGBUS. Since we are only + * testing the ABI here, so just ignore the SIGBUS and see if we can + * execute the instructions without receiving a SIGILL. Restore the + * handler of SIGBUS after this test. + */ + ign.sa_sigaction = ignore_signal; + ign.sa_flags = SA_SIGINFO | SA_RESTART; + sigemptyset(&ign.sa_mask); + sigaction(SIGBUS, &ign, &old); + + register void *xn asm ("x8") = src; + register u64 xt_1 asm ("x0"); + + /* LD64B x0, [x8] */ + asm volatile(".inst 0xf83fd100" : "=r" (xt_1) : "r" (xn) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7"); + + /* ST64B x0, [x8] */ + asm volatile(".inst 0xf83f9100" : : "r" (xt_1), "r" (xn) + : "x1", "x2", "x3", "x4", "x5", "x6", "x7"); + + sigaction(SIGBUS, &old, NULL); +} + static const struct hwcap_data { const char *name; unsigned long at_hwcap; @@ -1134,6 +1175,14 @@ static const struct hwcap_data { .hwcap_bit = HWCAP3_MTE_STORE_ONLY, .cpuinfo = "mtestoreonly", }, + { + .name = "LS64", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LS64, + .cpuinfo = "ls64", + .sigill_fn = ls64_sigill, + .sigill_reliable = true, + }, }; typedef void (*sighandler_fn)(int, siginfo_t *, void *); diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 1703543fb7c7..ce4550fb7224 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -128,8 +128,7 @@ static int sys_clone(unsigned long clone_flags, unsigned long newsp, int *parent_tidptr, unsigned long tls, int *child_tidptr) { - return my_syscall5(__NR_clone, clone_flags, newsp, parent_tidptr, tls, - child_tidptr); + return syscall(__NR_clone, clone_flags, newsp, parent_tidptr, tls, child_tidptr); } #define __STACK_SIZE (8 * 1024 * 1024) diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S index 73830f6bc99b..881dfa3b342e 100644 --- a/tools/testing/selftests/arm64/fp/fp-pidbench.S +++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S @@ -33,7 +33,7 @@ function _start puts "Iterations per test: " mov x20, #10000 - lsl x20, x20, #8 + lsl x20, x20, #12 mov x0, x20 bl putdec puts "\n" @@ -63,6 +63,10 @@ function _start puts "SVE used per syscall: " test_loop "rdvl x0, #8" + // Test non-SVE execution after SVE + puts "No SVE after SVE: " + test_loop + // And we're done out: mov x0, #0 diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 250977abc398..ae4cce6afe2b 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -22,7 +22,7 @@ static size_t page_size = 65536; static __attribute__((noinline)) void valid_gcs_function(void) { /* Do something the compiler can't optimise out */ - my_syscall1(__NR_prctl, PR_SVE_GET_VL); + syscall(__NR_prctl, PR_SVE_GET_VL); } static inline int gcs_set_status(unsigned long mode) @@ -36,12 +36,10 @@ static inline int gcs_set_status(unsigned long mode) * other 3 values passed in registers to the syscall are zero * since the kernel validates them. */ - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, - 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, 0, 0, 0); if (ret == 0) { - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &new_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &new_mode, 0, 0, 0); if (ret == 0) { if (new_mode != mode) { ksft_print_msg("Mode set to %lx not %lx\n", @@ -49,7 +47,7 @@ static inline int gcs_set_status(unsigned long mode) ret = -EINVAL; } } else { - ksft_print_msg("Failed to validate mode: %d\n", ret); + ksft_print_msg("Failed to validate mode: %d\n", errno); } if (enabling != chkfeat_gcs()) { @@ -69,10 +67,9 @@ static bool read_status(void) unsigned long state; int ret; - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &state, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &state, 0, 0, 0); if (ret != 0) { - ksft_print_msg("Failed to read state: %d\n", ret); + ksft_print_msg("Failed to read state: %d\n", errno); return false; } @@ -188,9 +185,8 @@ static bool map_guarded_stack(void) int elem; bool pass = true; - buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size, - SHADOW_STACK_SET_MARKER | - SHADOW_STACK_SET_TOKEN); + buf = (void *)syscall(__NR_map_shadow_stack, 0, page_size, + SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN); if (buf == MAP_FAILED) { ksft_print_msg("Failed to map %lu byte GCS: %d\n", page_size, errno); @@ -257,8 +253,7 @@ static bool test_fork(void) valid_gcs_function(); get_gcspr(); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &child_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0); if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { ksft_print_msg("GCS not enabled in child\n"); ret = -EINVAL; @@ -321,8 +316,7 @@ static bool test_vfork(void) valid_gcs_function(); get_gcspr(); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &child_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0); if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { ksft_print_msg("GCS not enabled in child\n"); ret = EXIT_FAILURE; @@ -390,17 +384,15 @@ int main(void) if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) ksft_exit_skip("SKIP GCS not supported\n"); - ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, - &gcs_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", errno); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, - gcs_mode, 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret); + ksft_exit_fail_msg("Failed to enable GCS: %d\n", errno); } ksft_set_plan(ARRAY_SIZE(tests)); @@ -410,9 +402,9 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) - ksft_print_msg("Failed to disable GCS: %d\n", ret); + ksft_print_msg("Failed to disable GCS: %d\n", errno); ksft_finished(); diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index 052d0f9f92b3..f6937f890039 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -6,3 +6,4 @@ check_mmap_options check_prctl check_ksm_options check_user_mem +check_hugetlb_options diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 19c1638e312a..a3ea98211ea6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,7 +23,6 @@ test_tcpnotify_user test_libbpf xdping test_cpp -test_progs_verification_cert *.d *.subskel.h *.skel.h @@ -45,3 +44,6 @@ xdp_synproxy xdp_hw_metadata xdp_features verification_cert.h +*.BTF +*.BTF_ids +*.BTF.base diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a17baf8c6fd7..f7e1e5f5511c 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -1,4 +1,5 @@ # TEMPORARY # Alphabetical order +exe_ctx # execution context check (e.g., hardirq, softirq, etc) get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4aa60e83ff19..c6bf4dfb1495 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -4,6 +4,7 @@ include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include CXX ?= $(CROSS_COMPILE)g++ +OBJCOPY ?= $(CROSS_COMPILE)objcopy CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) @@ -107,8 +108,6 @@ TEST_PROGS := test_kmod.sh \ test_xdping.sh \ test_bpftool_build.sh \ test_bpftool.sh \ - test_bpftool_map.sh \ - test_bpftool_metadata.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh @@ -643,6 +642,9 @@ $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c ) > $$@) endif +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: $(RESOLVE_BTFIDS) $(TRUNNER_OUTPUT)/btf_data.bpf.o +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: private TEST_NEEDS_BTFIDS = 1 + # compile individual test files # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ @@ -650,6 +652,10 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ + $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -695,13 +701,11 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(TRUNNER_LIB_OBJS) \ - $(RESOLVE_BTFIDS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool @@ -716,9 +720,12 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(Q)mkdir -p $(BUILD_DIR) $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) +# Generates a header with C array declaration, containing test_progs_verification_cert bytes $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) - $(Q)ln -fs $< test_progs_verification_cert && \ - xxd -i test_progs_verification_cert > $@ + $(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \ + hexdump -v -e '12/1 " 0x%02x," "\n"' $< | sed 's/0x ,//g; $$s/,$$//'; \ + echo "};"; \ + echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests @@ -741,7 +748,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ json_writer.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ - ip_check_defrag_frags.h + ip_check_defrag_frags.h \ + bpftool_helpers.c TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ @@ -890,10 +898,10 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool $(TEST_KMOD_TARGETS) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ + *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ - $(OUTPUT)/FEATURE-DUMP.selftests \ - test_progs_verification_cert + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index bd29bb2e6cb5..8368bd3a0665 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -265,6 +265,7 @@ static const struct argp_option opts[] = { { "verbose", 'v', NULL, 0, "Verbose debug output"}, { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, { "quiet", 'q', NULL, 0, "Be more quiet"}, + { "stacktrace", 's', NULL, 0, "Get stack trace"}, { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, "Set of CPUs for producer threads; implies --affinity"}, { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, @@ -350,6 +351,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'q': env.quiet = true; break; + case 's': + env.stacktrace = true; + break; case ARG_PROD_AFFINITY_SET: env.affinity = true; if (parse_num_list(arg, &env.prod_cpus.cpus, diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index bea323820ffb..7cf21936e7ed 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -26,6 +26,7 @@ struct env { bool list; bool affinity; bool quiet; + bool stacktrace; int consumer_cnt; int producer_cnt; int nr_cpus; diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 34018fc3927f..aeec9edd3851 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -146,6 +146,7 @@ static void setup_ctx(void) bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); ctx.skel->rodata->batch_iters = args.batch_iters; + ctx.skel->rodata->stacktrace = env.stacktrace; } static void load_ctx(void) diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh index 83e05e837871..123b7feb6935 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -49,6 +49,11 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" done +header "Perfbuf, multi-producer" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "pb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 --rb-sample-rate 50 pb-libbpf)" +done + header "Ringbuf, multi-producer contention in overwrite mode, no consumer" for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)" diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2cd9165c7348..4b7210c318dd 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -580,11 +580,6 @@ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) struct bpf_iter_kmem_cache; extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym; @@ -615,9 +610,17 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) + extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 -extern const int __preempt_count __ksym; +extern const int __preempt_count __ksym __weak; + +struct pcpu_hot___local { + int preempt_count; +} __attribute__((preserve_access_index)); + +extern struct pcpu_hot___local pcpu_hot __ksym __weak; #endif struct task_struct___preempt_rt { @@ -627,7 +630,19 @@ struct task_struct___preempt_rt { static inline int get_preempt_count(void) { #if defined(bpf_target_x86) - return *(int *) bpf_this_cpu_ptr(&__preempt_count); + /* By default, read the per-CPU __preempt_count. */ + if (bpf_ksym_exists(&__preempt_count)) + return *(int *) bpf_this_cpu_ptr(&__preempt_count); + + /* + * If __preempt_count does not exist, try to read preempt_count under + * struct pcpu_hot. Between v6.1 and v6.14 -- more specifically, + * [64701838bf057, 46e8fff6d45fe), preempt_count had been managed + * under struct pcpu_hot. + */ + if (bpf_core_field_exists(pcpu_hot.preempt_count)) + return ((struct pcpu_hot___local *) + bpf_this_cpu_ptr(&pcpu_hot))->preempt_count; #elif defined(bpf_target_arm64) return bpf_get_current_task_btf()->thread_info.preempt.count; #endif @@ -653,4 +668,60 @@ static inline int bpf_in_interrupt(void) (tsk->softirq_disable_cnt & SOFTIRQ_MASK); } +/* Description + * Report whether it is in NMI context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_nmi(void) +{ + return get_preempt_count() & NMI_MASK; +} + +/* Description + * Report whether it is in hard IRQ context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_hardirq(void) +{ + return get_preempt_count() & HARDIRQ_MASK; +} + +/* Description + * Report whether it is in softirq context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_serving_softirq(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return (pcnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; + + tsk = (void *) bpf_get_current_task_btf(); + return (tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; +} + +/* Description + * Report whether it is in task context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_task(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return !(pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + tsk = (void *) bpf_get_current_task_btf(); + return !((pcnt & (NMI_MASK | HARDIRQ_MASK)) | + ((tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET)); +} #endif diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index e0189254bb6e..7dad01439391 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -79,9 +79,6 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; -extern bool bpf_session_is_return(void) __ksym __weak; -extern __u64 *bpf_session_cookie(void) __ksym __weak; - struct dentry; /* Description * Returns xattr of a dentry diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c new file mode 100644 index 000000000000..a5824945a4a5 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "bpftool_helpers.h" +#include <unistd.h> +#include <string.h> +#include <stdbool.h> + +#define BPFTOOL_PATH_MAX_LEN 64 +#define BPFTOOL_FULL_CMD_MAX_LEN 512 + +#define BPFTOOL_DEFAULT_PATH "tools/sbin/bpftool" + +static int detect_bpftool_path(char *buffer) +{ + char tmp[BPFTOOL_PATH_MAX_LEN]; + + /* Check default bpftool location (will work if we are running the + * default flavor of test_progs) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Check alternate bpftool location (will work if we are running a + * specific flavor of test_progs, e.g. cpuv4 or no_alu32) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Failed to find bpftool binary */ + return 1; +} + +static int run_command(char *args, char *output_buf, size_t output_max_len) +{ + static char bpftool_path[BPFTOOL_PATH_MAX_LEN] = {0}; + bool suppress_output = !(output_buf && output_max_len); + char command[BPFTOOL_FULL_CMD_MAX_LEN]; + FILE *f; + int ret; + + /* Detect and cache bpftool binary location */ + if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path)) + return 1; + + ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s", + bpftool_path, args, + suppress_output ? " > /dev/null 2>&1" : ""); + + f = popen(command, "r"); + if (!f) + return 1; + + if (!suppress_output) + fread(output_buf, 1, output_max_len, f); + ret = pclose(f); + + return ret; +} + +int run_bpftool_command(char *args) +{ + return run_command(args, NULL, 0); +} + +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len) +{ + return run_command(args, output_buf, output_max_len); +} + diff --git a/tools/testing/selftests/bpf/bpftool_helpers.h b/tools/testing/selftests/bpf/bpftool_helpers.h new file mode 100644 index 000000000000..dec1ba201410 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#pragma once + +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> + +#define MAX_BPFTOOL_CMD_LEN (256) + +int run_bpftool_command(char *args); +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len); diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h new file mode 100644 index 000000000000..3f59b127943b --- /dev/null +++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef __CGROUP_ITER_MEMCG_H +#define __CGROUP_ITER_MEMCG_H + +struct memcg_query { + /* some node_stat_item's */ + unsigned long nr_anon_mapped; + unsigned long nr_shmem; + unsigned long nr_file_pages; + unsigned long nr_file_mapped; + /* some memcg_stat_item */ + unsigned long memcg_kmem; + /* some vm_event_item */ + unsigned long pgfault; +}; + +#endif /* __CGROUP_ITER_MEMCG_H */ diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 558839e3c185..24855381290d 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -1,6 +1,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BPF=y CONFIG_BPF_EVENTS=y CONFIG_BPF_JIT=y diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c deleted file mode 100644 index a4121d2248ac..000000000000 --- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ -#define _GNU_SOURCE -#include <sched.h> -#include <unistd.h> -#include <stdlib.h> -#include <stdbool.h> -#include <errno.h> -#include <string.h> -#include <pthread.h> - -#include <bpf/bpf.h> -#include <bpf/libbpf.h> - -#include "bpf_util.h" -#include "test_maps.h" -#include "task_local_storage_helpers.h" -#include "read_bpf_task_storage_busy.skel.h" - -struct lookup_ctx { - bool start; - bool stop; - int pid_fd; - int map_fd; - int loop; -}; - -static void *lookup_fn(void *arg) -{ - struct lookup_ctx *ctx = arg; - long value; - int i = 0; - - while (!ctx->start) - usleep(1); - - while (!ctx->stop && i++ < ctx->loop) - bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value); - return NULL; -} - -static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr) -{ - unsigned int i; - - ctx->stop = true; - ctx->start = true; - for (i = 0; i < nr; i++) - pthread_join(tids[i], NULL); -} - -void test_task_storage_map_stress_lookup(void) -{ -#define MAX_NR_THREAD 4096 - unsigned int i, nr = 256, loop = 8192, cpu = 0; - struct read_bpf_task_storage_busy *skel; - pthread_t tids[MAX_NR_THREAD]; - struct lookup_ctx ctx; - cpu_set_t old, new; - const char *cfg; - int err; - - cfg = getenv("TASK_STORAGE_MAP_NR_THREAD"); - if (cfg) { - nr = atoi(cfg); - if (nr > MAX_NR_THREAD) - nr = MAX_NR_THREAD; - } - cfg = getenv("TASK_STORAGE_MAP_NR_LOOP"); - if (cfg) - loop = atoi(cfg); - cfg = getenv("TASK_STORAGE_MAP_PIN_CPU"); - if (cfg) - cpu = atoi(cfg); - - skel = read_bpf_task_storage_busy__open_and_load(); - err = libbpf_get_error(skel); - CHECK(err, "open_and_load", "error %d\n", err); - - /* Only for a fully preemptible kernel */ - if (!skel->kconfig->CONFIG_PREEMPTION) { - printf("%s SKIP (no CONFIG_PREEMPTION)\n", __func__); - read_bpf_task_storage_busy__destroy(skel); - skips++; - return; - } - - /* Save the old affinity setting */ - sched_getaffinity(getpid(), sizeof(old), &old); - - /* Pinned on a specific CPU */ - CPU_ZERO(&new); - CPU_SET(cpu, &new); - sched_setaffinity(getpid(), sizeof(new), &new); - - ctx.start = false; - ctx.stop = false; - ctx.pid_fd = sys_pidfd_open(getpid(), 0); - ctx.map_fd = bpf_map__fd(skel->maps.task); - ctx.loop = loop; - for (i = 0; i < nr; i++) { - err = pthread_create(&tids[i], NULL, lookup_fn, &ctx); - if (err) { - abort_lookup(&ctx, tids, i); - CHECK(err, "pthread_create", "error %d\n", err); - goto out; - } - } - - ctx.start = true; - for (i = 0; i < nr; i++) - pthread_join(tids[i], NULL); - - skel->bss->pid = getpid(); - err = read_bpf_task_storage_busy__attach(skel); - CHECK(err, "attach", "error %d\n", err); - - /* Trigger program */ - sys_gettid(); - skel->bss->pid = 0; - - CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy); -out: - read_bpf_task_storage_busy__destroy(skel); - /* Restore affinity setting */ - sched_setaffinity(getpid(), sizeof(old), &old); - printf("%s:PASS\n", __func__); -} diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c index d15867cddde0..4f2866a615ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_list.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -27,17 +27,23 @@ static int list_sum(struct arena_list_head *head) return sum; } -static void test_arena_list_add_del(int cnt) +static void test_arena_list_add_del(int cnt, bool nonsleepable) { LIBBPF_OPTS(bpf_test_run_opts, opts); struct arena_list *skel; int expected_sum = (u64)cnt * (cnt - 1) / 2; int ret, sum; - skel = arena_list__open_and_load(); - if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load")) + skel = arena_list__open(); + if (!ASSERT_OK_PTR(skel, "arena_list__open")) return; + skel->rodata->nonsleepable = nonsleepable; + + ret = arena_list__load(skel); + if (!ASSERT_OK(ret, "arena_list__load")) + goto out; + skel->bss->cnt = cnt; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); ASSERT_OK(ret, "ret_add"); @@ -65,7 +71,11 @@ out: void test_arena_list(void) { if (test__start_subtest("arena_list_1")) - test_arena_list_add_del(1); + test_arena_list_add_del(1, false); if (test__start_subtest("arena_list_1000")) - test_arena_list_add_del(1000); + test_arena_list_add_del(1000, false); + if (test__start_subtest("arena_list_1_nonsleepable")) + test_arena_list_add_del(1, true); + if (test__start_subtest("arena_list_1000_nonsleepable")) + test_arena_list_add_del(1000, true); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c index d138cc7b1bda..75b0cf2467ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c @@ -240,6 +240,208 @@ static void check_nonstatic_global_other_sec(struct bpf_gotox *skel) bpf_link__destroy(link); } +/* + * The following subtests do not use skeleton rather than to check + * if the test should be skipped. + */ + +static int create_jt_map(__u32 max_entries) +{ + const char *map_name = "jt"; + __u32 key_size = 4; + __u32 value_size = sizeof(struct bpf_insn_array_value); + + return bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, map_name, + key_size, value_size, max_entries, NULL); +} + +static int prog_load(struct bpf_insn *insns, __u32 insn_cnt) +{ + return bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); +} + +static int __check_ldimm64_off_prog_load(__u32 max_entries, __u32 off) +{ + struct bpf_insn insns[] = { + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int map_fd, ret; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + insns[0].imm = map_fd; + insns[1].imm = off; + + ret = prog_load(insns, ARRAY_SIZE(insns)); + close(map_fd); + return ret; +} + +/* + * Check that loads from an instruction array map are only allowed with offsets + * which are multiples of 8 and do not point to outside of the map. + */ +static void check_ldimm64_off_load(struct bpf_gotox *skel __always_unused) +{ + const __u32 max_entries = 10; + int prog_fd; + __u32 off; + + for (off = 0; off < max_entries; off++) { + prog_fd = __check_ldimm64_off_prog_load(max_entries, off * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_off_prog_load")) + return; + close(prog_fd); + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, 7 /* not a multiple of 8 */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, max_entries * 8 /* too large */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } +} + +static int __check_ldimm64_gotox_prog_load(struct bpf_insn *insns, + __u32 insn_cnt, + __u32 off1, __u32 off2) +{ + const __u32 values[] = {5, 7, 9, 11, 13, 15}; + const __u32 max_entries = ARRAY_SIZE(values); + struct bpf_insn_array_value val = {}; + int map_fd, ret, i; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + + for (i = 0; i < max_entries; i++) { + val.orig_off = values[i]; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, + "bpf_map_update_elem")) { + close(map_fd); + return -1; + } + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + /* r1 = &map + offset1 */ + insns[0].imm = map_fd; + insns[1].imm = off1; + + /* r1 += off2 */ + insns[2].imm = off2; + + ret = prog_load(insns, insn_cnt); + close(map_fd); + return ret; +} + +static void reject_offsets(struct bpf_insn *insns, __u32 insn_cnt, __u32 off1, __u32 off2) +{ + int prog_fd; + + prog_fd = __check_ldimm64_gotox_prog_load(insns, insn_cnt, off1, off2); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_gotox_prog_load")) + close(prog_fd); +} + +/* + * Verify a bit more complex programs which include indirect jumps + * and with jump tables loaded with a non-zero offset + */ +static void check_ldimm64_off_gotox(struct bpf_gotox *skel __always_unused) +{ + struct bpf_insn insns[] = { + /* + * The following instructions perform an indirect jump to + * labels below. Thus valid offsets in the map are {0,...,5}. + * The program rewrites the offsets in the instructions below: + * r1 = &map + offset1 + * r1 += offset2 + * r1 = *r1 + * gotox r1 + */ + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0, 0), + + /* case 0: */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* case 1: */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* case 2: */ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* case 3: */ + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* case 4: */ + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_EXIT_INSN(), + /* default: */ + BPF_MOV64_IMM(BPF_REG_0, 5), + BPF_EXIT_INSN(), + }; + int prog_fd, err; + __u32 off1, off2; + + /* allow all combinations off1 + off2 < 6 */ + for (off1 = 0; off1 < 6; off1++) { + for (off2 = 0; off1 + off2 < 6; off2++) { + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = __check_ldimm64_gotox_prog_load(insns, ARRAY_SIZE(insns), + off1 * 8, off2 * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_gotox_prog_load")) + return; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) { + close(prog_fd); + return; + } + + if (!ASSERT_EQ(topts.retval, off1 + off2, "test_run_opts retval")) { + close(prog_fd); + return; + } + + close(prog_fd); + } + } + + /* reject off1 + off2 >= 6 */ + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 3, 8 * 3); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 7, 8 * 0); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 0, 8 * 7); + + /* reject (off1 + off2) % 8 != 0 */ + reject_offsets(insns, ARRAY_SIZE(insns), 3, 3); + reject_offsets(insns, ARRAY_SIZE(insns), 7, 0); + reject_offsets(insns, ARRAY_SIZE(insns), 0, 7); +} + void test_bpf_gotox(void) { struct bpf_gotox *skel; @@ -288,5 +490,11 @@ void test_bpf_gotox(void) if (test__start_subtest("one-map-two-jumps")) __subtest(skel, check_one_map_two_jumps); + if (test__start_subtest("check-ldimm64-off")) + __subtest(skel, check_ldimm64_off_load); + + if (test__start_subtest("check-ldimm64-off-gotox")) + __subtest(skel, check_ldimm64_off_gotox); + bpf_gotox__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index dd6512fa652b..215878ea04de 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -19,6 +19,10 @@ struct { { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, }; enum { @@ -111,7 +115,6 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK(err, "bpf_prog_test_run")) goto end; - ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c new file mode 100644 index 000000000000..e0eb869cb1b4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdint.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <linux/bpf.h> +#include <bpf/libbpf.h> +#include <bpftool_helpers.h> +#include <test_progs.h> +#include <bpf/bpf.h> +#include "security_bpf_map.skel.h" + +#define PROTECTED_MAP_NAME "prot_map" +#define UNPROTECTED_MAP_NAME "not_prot_map" +#define BPF_ITER_FILE "bpf_iter_map_elem.bpf.o" +#define BPFFS_PIN_DIR "/sys/fs/bpf/test_bpftool_map" +#define INNER_MAP_NAME "inner_map_tt" +#define OUTER_MAP_NAME "outer_map_tt" + +#define MAP_NAME_MAX_LEN 64 +#define PATH_MAX_LEN 128 + +enum map_protection { + PROTECTED, + UNPROTECTED +}; + +struct test_desc { + char *name; + enum map_protection protection; + struct bpf_map *map; + char *map_name; + bool pinned; + char pin_path[PATH_MAX_LEN]; + bool write_must_fail; +}; + +static struct security_bpf_map *general_setup(void) +{ + struct security_bpf_map *skel; + uint32_t key, value; + int ret, i; + + skel = security_bpf_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto end; + + struct bpf_map *maps[] = {skel->maps.prot_map, skel->maps.not_prot_map}; + + ret = security_bpf_map__attach(skel); + if (!ASSERT_OK(ret, "attach maps security programs")) + goto end_destroy; + + for (i = 0; i < sizeof(maps)/sizeof(struct bpf_map *); i++) { + for (key = 0; key < 2; key++) { + int ret = bpf_map__update_elem(maps[i], &key, + sizeof(key), &key, sizeof(key), + 0); + if (!ASSERT_OK(ret, "set initial map value")) + goto end_destroy; + } + } + + key = 0; + value = 1; + ret = bpf_map__update_elem(skel->maps.prot_status_map, &key, + sizeof(key), &value, sizeof(value), 0); + if (!ASSERT_OK(ret, "configure map protection")) + goto end_destroy; + + if (!ASSERT_OK(mkdir(BPFFS_PIN_DIR, S_IFDIR), "create bpffs pin dir")) + goto end_destroy; + + return skel; +end_destroy: + security_bpf_map__destroy(skel); +end: + return NULL; +} + +static void general_cleanup(struct security_bpf_map *skel) +{ + rmdir(BPFFS_PIN_DIR); + security_bpf_map__destroy(skel); +} + +static void update_test_desc(struct security_bpf_map *skel, + struct test_desc *test) +{ + /* Now that the skeleton is loaded, update all missing fields to + * have the subtest properly configured + */ + if (test->protection == PROTECTED) { + test->map = skel->maps.prot_map; + test->map_name = PROTECTED_MAP_NAME; + } else { + test->map = skel->maps.not_prot_map; + test->map_name = UNPROTECTED_MAP_NAME; + } +} + +static int test_setup(struct security_bpf_map *skel, struct test_desc *desc) +{ + int ret; + + update_test_desc(skel, desc); + + if (desc->pinned) { + ret = snprintf(desc->pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + desc->name); + if (!ASSERT_GT(ret, 0, "format pin path")) + return 1; + ret = bpf_map__pin(desc->map, desc->pin_path); + if (!ASSERT_OK(ret, "pin map")) + return 1; + } + + return 0; +} + +static void test_cleanup(struct test_desc *desc) +{ + if (desc->pinned) + bpf_map__unpin(desc->map, NULL); +} + +static int lookup_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map lookup %s key 0 0 0 0", + map_handle); + if (!ASSERT_GT(ret, 0, "format map lookup cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int read_map_btf_data(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "btf dump map %s", + map_handle); + if (!ASSERT_GT(ret, 0, "format map btf dump cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int write_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map update %s key 0 0 0 0 value 1 1 1 1", map_handle); + if (!ASSERT_GT(ret, 0, "format value write cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int delete_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map delete %s key 0 0 0 0", map_handle); + if (!ASSERT_GT(ret, 0, "format value deletion cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int iterate_on_map_values(char *map_handle, char *iter_pin_path) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "iter pin %s %s map %s", + BPF_ITER_FILE, iter_pin_path, map_handle); + if (!ASSERT_GT(ret, 0, "format iterator creation cmd")) + return 1; + ret = run_bpftool_command(cmd); + if (ret) + return ret; + ret = snprintf(cmd, MAP_NAME_MAX_LEN, "cat %s", iter_pin_path); + if (ret < 0) + goto cleanup; + ret = system(cmd); + +cleanup: + unlink(iter_pin_path); + return ret; +} + +static int create_inner_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type array key 4 value 4 entries 4 name %s", + BPFFS_PIN_DIR, INNER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format inner map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int create_outer_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type hash_of_maps key 4 value 4 entries 2 name %s inner_map name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, OUTER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void delete_pinned_map(char *map_name) +{ + char pin_path[PATH_MAX_LEN]; + int ret; + + ret = snprintf(pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + map_name); + if (ret >= 0) + unlink(pin_path); +} + +static int add_outer_map_entry(int key) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map update pinned %s/%s key %d 0 0 0 value name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, key, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map value addition cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void test_basic_access(struct test_desc *desc) +{ + char map_handle[MAP_NAME_MAX_LEN]; + char iter_pin_path[PATH_MAX_LEN]; + int ret; + + if (desc->pinned) + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "pinned %s", + desc->pin_path); + else + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "name %s", + desc->map_name); + if (!ASSERT_GT(ret, 0, "format map handle")) + return; + + ret = lookup_map_value(map_handle); + ASSERT_OK(ret, "read map value"); + + ret = read_map_btf_data(map_handle); + ASSERT_OK(ret, "read map btf data"); + + ret = write_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "write map value"); + + ret = delete_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "delete map value"); + /* Restore deleted value */ + if (!ret) + write_map_value(map_handle); + + ret = snprintf(iter_pin_path, PATH_MAX_LEN, "%s/iter", BPFFS_PIN_DIR); + if (ASSERT_GT(ret, 0, "format iter pin path")) { + ret = iterate_on_map_values(map_handle, iter_pin_path); + ASSERT_OK(ret, "iterate on map values"); + } +} + +static void test_create_nested_maps(void) +{ + if (!ASSERT_OK(create_inner_map(), "create inner map")) + return; + if (!ASSERT_OK(create_outer_map(), "create outer map")) + goto end_cleanup_inner; + ASSERT_OK(add_outer_map_entry(0), "add a first entry in outer map"); + ASSERT_OK(add_outer_map_entry(1), "add a second entry in outer map"); + ASSERT_NEQ(add_outer_map_entry(2), 0, "add a third entry in outer map"); + + delete_pinned_map(OUTER_MAP_NAME); +end_cleanup_inner: + delete_pinned_map(INNER_MAP_NAME); +} + +static void test_btf_list(void) +{ + ASSERT_OK(run_bpftool_command("btf list"), "list btf data"); +} + +static struct test_desc tests[] = { + { + .name = "unprotected_unpinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = false, + }, + { + .name = "unprotected_pinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = false, + }, + { + .name = "protected_unpinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = true, + }, + { + .name = "protected_pinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = true, + } +}; + +static const size_t tests_count = ARRAY_SIZE(tests); + +void test_bpftool_maps_access(void) +{ + struct security_bpf_map *skel; + struct test_desc *current; + int i; + + skel = general_setup(); + if (!ASSERT_OK_PTR(skel, "prepare programs")) + goto cleanup; + + for (i = 0; i < tests_count; i++) { + current = &tests[i]; + if (!test__start_subtest(current->name)) + continue; + if (ASSERT_OK(test_setup(skel, current), "subtest setup")) { + test_basic_access(current); + test_cleanup(current); + } + } + if (test__start_subtest("nested_maps")) + test_create_nested_maps(); + if (test__start_subtest("btf_list")) + test_btf_list(); + +cleanup: + general_cleanup(skel); +} + diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c new file mode 100644 index 000000000000..408ace90dc7e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <bpftool_helpers.h> +#include <test_progs.h> +#include <linux/bpf.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <stdbool.h> + +#define BPFFS_DIR "/sys/fs/bpf/test_metadata" +#define BPFFS_USED BPFFS_DIR "/used" +#define BPFFS_UNUSED BPFFS_DIR "/unused" + +#define BPF_FILE_USED "metadata_used.bpf.o" +#define BPF_FILE_UNUSED "metadata_unused.bpf.o" +#define METADATA_MAP_NAME "metadata.rodata" + +#define MAX_BPFTOOL_OUTPUT_LEN (64*1024) + +#define MAX_TOKENS_TO_CHECK 3 +static char output[MAX_BPFTOOL_OUTPUT_LEN]; + +struct test_desc { + char *name; + char *bpf_prog; + char *bpffs_path; + char *expected_output[MAX_TOKENS_TO_CHECK]; + char *expected_output_json[MAX_TOKENS_TO_CHECK]; + char *metadata_map_name; +}; + +static int setup(struct test_desc *test) +{ + return mkdir(BPFFS_DIR, 0700); +} + +static void cleanup(struct test_desc *test) +{ + unlink(test->bpffs_path); + rmdir(BPFFS_DIR); +} + +static int check_metadata(char *buf, char * const *tokens, int count) +{ + int i; + + for (i = 0; i < count && tokens[i]; i++) + if (!strstr(buf, tokens[i])) + return 1; + + return 0; +} + +static void run_test(struct test_desc *test) +{ + int ret; + char cmd[MAX_BPFTOOL_CMD_LEN]; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog load %s %s", + test->bpf_prog, test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format prog insert command")) + return; + ret = run_bpftool_command(cmd); + if (!ASSERT_OK(ret, "load program")) + return; + + /* Check output with default format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info")) { + ret = check_metadata(output, test->expected_output, + ARRAY_SIZE(test->expected_output)); + ASSERT_OK(ret, "find metadata"); + } + + /* Check output with json format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog -j show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command in json")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info in json")) { + ret = check_metadata(output, test->expected_output_json, + ARRAY_SIZE(test->expected_output_json)); + ASSERT_OK(ret, "find metadata in json"); + } + + /* Check that the corresponding map can be found and accessed */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map show name %s", + test->metadata_map_name); + if (!ASSERT_GT(ret, 0, "format map check command")) + return; + ASSERT_OK(run_bpftool_command(cmd), "access metadata map"); +} + +static struct test_desc tests[] = { + { + .name = "metadata_unused", + .bpf_prog = BPF_FILE_UNUSED, + .bpffs_path = BPFFS_UNUSED, + .expected_output = { + "a = \"foo\"", + "b = 1" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"foo\",\"b\":1}" + }, + .metadata_map_name = METADATA_MAP_NAME + }, + { + .name = "metadata_used", + .bpf_prog = BPF_FILE_USED, + .bpffs_path = BPFFS_USED, + .expected_output = { + "a = \"bar\"", + "b = 2" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"bar\",\"b\":2}" + }, + .metadata_map_name = METADATA_MAP_NAME + } +}; +static const int tests_count = ARRAY_SIZE(tests); + +void test_bpftool_metadata(void) +{ + int i; + + for (i = 0; i < tests_count; i++) { + if (!test__start_subtest(tests[i].name)) + continue; + if (ASSERT_OK(setup(&tests[i]), "setup bpffs pin dir")) { + run_test(&tests[i]); + cleanup(&tests[i]); + } + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 10cba526d3e6..f1642794f70e 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -875,8 +875,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, "int cpu_number = (int)100", 100); #endif - TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, - "static int bpf_cgrp_storage_busy = (int)2", 2); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_bprintf_nest_level", int, BTF_F_COMPACT, + "static int bpf_bprintf_nest_level = (int)2", 2); } struct btf_dump_string_ctx { diff --git a/tools/testing/selftests/bpf/prog_tests/btf_permute.c b/tools/testing/selftests/bpf/prog_tests/btf_permute.c new file mode 100644 index 000000000000..04ade5ad77ac --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_permute.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Xiaomi */ + +#include <test_progs.h> +#include <bpf/btf.h> +#include "btf_helpers.h" + +static void permute_base_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[2] FUNC 'f' type_id=6 linkage=static", + "[3] PTR '(anon)' type_id=4", + "[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[5] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[6] FUNC_PROTO '(anon)' ret_type_id=4 vlen=1\n" + "\t'p' type_id=3"); +} + +/* Ensure btf__permute works as expected in the base-BTF scenario */ +static void test_permute_base(void) +{ + struct btf *btf; + __u32 permute_ids[7]; + int err; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_main_btf")) + return; + + btf__add_int(btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf, 1); /* [2] ptr to int */ + btf__add_struct(btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(btf, 1); /* [5] int (*)(int *p); */ + btf__add_func_param(btf, "p", 2); + btf__add_func(btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_base")) + goto done; + permute_base_check(btf); + + /* ids[0] must be 0 for base BTF */ + permute_ids[0] = 4; /* [0] -> [0] */ + permute_ids[1] = 0; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* id_map_cnt is invalid */ + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 4; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 6; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Type ID must be valid */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 3; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 7; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + +done: + btf__free(btf); +} + +static void permute_split_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] FUNC 'f' type_id=5 linkage=static", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0"); +} + +/* Ensure btf__permute works as expected in the split-BTF scenario */ +static void test_permute_split(void) +{ + struct btf *split_btf = NULL, *base_btf = NULL; + __u32 permute_ids[4]; + int err, start_id; + + base_btf = btf__new_empty(); + if (!ASSERT_OK_PTR(base_btf, "empty_main_btf")) + return; + + btf__add_int(base_btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(base_btf, 1); /* [2] ptr to int */ + VALIDATE_RAW_BTF( + base_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + split_btf = btf__new_empty_split(base_btf); + if (!ASSERT_OK_PTR(split_btf, "empty_split_btf")) + goto cleanup; + btf__add_struct(split_btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(split_btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(split_btf, 1); /* [5] int (*)(int p); */ + btf__add_func_param(split_btf, "p", 2); + btf__add_func(split_btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + split_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + start_id = btf__type_cnt(base_btf); + permute_ids[3 - start_id] = 6; /* [3] -> [6] */ + permute_ids[4 - start_id] = 3; /* [4] -> [3] */ + permute_ids[5 - start_id] = 5; /* [5] -> [5] */ + permute_ids[6 - start_id] = 4; /* [6] -> [4] */ + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_split")) + goto cleanup; + permute_split_check(split_btf); + + /* + * For split BTF, id_map_cnt must equal to the number of types + * added on top of base BTF + */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 3; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Can not map to base ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 2; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + +cleanup: + btf__free(split_btf); + btf__free(base_btf); +} + +void test_btf_permute(void) +{ + if (test__start_subtest("permute_base")) + test_permute_base(); + if (test__start_subtest("permute_split")) + test_permute_split(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 574d9a0cdc8e..0f88a9d00a22 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -190,6 +190,16 @@ static void test_walk_self_only(struct cgroup_iter *skel) BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } +static void test_walk_children(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n" EPILOGUE, cg_id[CHILD1], + cg_id[CHILD2]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_CHILDREN, "children"); +} + static void test_walk_dead_self_only(struct cgroup_iter *skel) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); @@ -325,6 +335,8 @@ void test_cgroup_iter(void) test_walk_dead_self_only(skel); if (test__start_subtest("cgroup_iter__self_only_css_task")) test_walk_self_only_css_task(); + if (test__start_subtest("cgroup_iter__children")) + test_walk_children(skel); out: cgroup_iter__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c new file mode 100644 index 000000000000..a5afd16705f0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> +#include <bpf/libbpf.h> +#include <bpf/btf.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <unistd.h> +#include "cgroup_helpers.h" +#include "cgroup_iter_memcg.h" +#include "cgroup_iter_memcg.skel.h" + +static int read_stats(struct bpf_link *link) +{ + int fd, ret = 0; + ssize_t bytes; + + fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_OK_FD(fd, "bpf_iter_create")) + return 1; + + /* + * Invoke iter program by reading from its fd. We're not expecting any + * data to be written by the bpf program so the result should be zero. + * Results will be read directly through the custom data section + * accessible through skel->data_query.memcg_query. + */ + bytes = read(fd, NULL, 0); + if (!ASSERT_EQ(bytes, 0, "read fd")) + ret = 1; + + close(fd); + return ret; +} + +static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg anon usage by mapping and writing + * to a new anon region. + */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val"); + +cleanup: + munmap(map, len); +} + +static void test_file(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + char *path; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + path = "/tmp/test_cgroup_iter_memcg"; + + /* + * Increase memcg file usage by creating and writing + * to a mapped file. + */ + fd = open(path, O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "open fd")) + return; + if (!ASSERT_OK(ftruncate(fd, len), "ftruncate")) + goto cleanup_fd; + + map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file")) + goto cleanup_fd; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup_map; + + ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value"); + ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value"); + +cleanup_map: + munmap(map, len); +cleanup_fd: + close(fd); + unlink(path); +} + +static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + size_t len; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg shmem usage by creating and writing + * to a shmem object. + */ + fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "shm_open")) + return; + + if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate")) + goto cleanup; + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value"); + +cleanup: + close(fd); + shm_unlink("/tmp_shmem"); +} + +#define NR_PIPES 64 +static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + int fds[NR_PIPES][2], i; + + /* + * Increase kmem value by creating pipes which will allocate some + * kernel buffers. + */ + for (i = 0; i < NR_PIPES; i++) { + if (!ASSERT_OK(pipe(fds[i]), "pipe")) + goto cleanup; + } + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value"); + +cleanup: + for (i = i - 1; i >= 0; i--) { + close(fds[i][0]); + close(fds[i][1]); + } +} + +static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* Create region to use for triggering a page fault. */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + /* Trigger page fault. */ + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val"); + +cleanup: + munmap(map, len); +} + +void test_cgroup_iter_memcg(void) +{ + char *cgroup_rel_path = "/cgroup_iter_memcg_test"; + struct cgroup_iter_memcg *skel; + struct bpf_link *link; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(cgroup_rel_path); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join")) + return; + + skel = cgroup_iter_memcg__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load")) + goto cleanup_cgroup_fd; + + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = { + .cgroup.cgroup_fd = cgroup_fd, + .cgroup.order = BPF_CGROUP_ITER_SELF_ONLY, + }; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto cleanup_skel; + + if (test__start_subtest("cgroup_iter_memcg__anon")) + test_anon(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__shmem")) + test_shmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__file")) + test_file(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__kmem")) + test_kmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__pgfault")) + test_pgfault(link, &skel->data_query->memcg_query); + + bpf_link__destroy(link); +cleanup_skel: + cgroup_iter_memcg__destroy(skel); +cleanup_cgroup_fd: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c index e442be9dde7e..fb2cea710db3 100644 --- a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c @@ -233,7 +233,7 @@ static void subtest_dmabuf_iter_check_lots_of_buffers(struct dmabuf_iter *skel) while ((bytes_read = read(iter_fd, buf, sizeof(buf))) > 0) total_bytes_read += bytes_read; - ASSERT_GT(total_bytes_read, getpagesize(), "total_bytes_read"); + ASSERT_GT(total_bytes_read, 4096, "total_bytes_read"); close(iter_fd); } diff --git a/tools/testing/selftests/bpf/prog_tests/exe_ctx.c b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c new file mode 100644 index 000000000000..aed6a6ef0876 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include "test_ctx.skel.h" + +void test_exe_ctx(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + cpu_set_t old_cpuset, target_cpuset; + struct test_ctx *skel; + int err, prog_fd; + + /* 1. Pin the current process to CPU 0. */ + if (sched_getaffinity(0, sizeof(old_cpuset), &old_cpuset) == 0) { + CPU_ZERO(&target_cpuset); + CPU_SET(0, &target_cpuset); + ASSERT_OK(sched_setaffinity(0, sizeof(target_cpuset), + &target_cpuset), "setaffinity"); + } + + skel = test_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto restore_affinity; + + err = test_ctx__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* 2. When we run this, the kernel will execute the BPF prog on CPU 0. */ + prog_fd = bpf_program__fd(skel->progs.trigger_all_contexts); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "test_run_trigger"); + + /* 3. Wait for the local CPU's softirq/tasklet to finish. */ + for (int i = 0; i < 1000; i++) { + if (skel->bss->count_task > 0 && + skel->bss->count_hardirq > 0 && + skel->bss->count_softirq > 0) + break; + usleep(1000); /* Wait 1ms per iteration, up to 1 sec total */ + } + + /* On CPU 0, these should now all be non-zero. */ + ASSERT_GT(skel->bss->count_task, 0, "task_ok"); + ASSERT_GT(skel->bss->count_hardirq, 0, "hardirq_ok"); + ASSERT_GT(skel->bss->count_softirq, 0, "softirq_ok"); + +cleanup: + test_ctx__destroy(skel); + +restore_affinity: + ASSERT_OK(sched_setaffinity(0, sizeof(old_cpuset), &old_cpuset), + "restore_affinity"); +} diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c new file mode 100644 index 000000000000..a299aeb8cc2e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include <test_progs.h> +#include "fsession_test.skel.h" + +static int check_result(struct fsession_test *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* Trigger test function calls */ + prog_fd = bpf_program__fd(skel->progs.test1); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return err; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return topts.retval; + + for (int i = 0; i < sizeof(*skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(((__u64 *)skel->bss)[i], 1, "test_result")) + return -EINVAL; + } + + return 0; +} + +static void test_fsession_basic(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + +static void test_fsession_reattach(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) + goto cleanup; + + /* first attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_first_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + + /* detach */ + fsession_test__detach(skel); + + /* reset counters */ + memset(skel->bss, 0, sizeof(*skel->bss)); + + /* second attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_second_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + +cleanup: + fsession_test__destroy(skel); +} + +static void test_fsession_cookie(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + goto cleanup; + + /* + * The test_fsession_basic() will test the session cookie with + * bpf_get_func_ip() case, so we need only check + * the cookie without bpf_get_func_ip() case here + */ + bpf_program__set_autoload(skel->progs.test6, false); + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + skel->bss->test6_entry_result = 1; + skel->bss->test6_exit_result = 1; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + +void test_fsession_test(void) +{ + if (test__start_subtest("fsession_test")) + test_fsession_basic(); + if (test__start_subtest("fsession_reattach")) + test_fsession_reattach(); + if (test__start_subtest("fsession_cookie")) + test_fsession_cookie(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index 64a9c95d4acf..96b27de05524 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -33,11 +33,15 @@ void test_get_func_args_test(void) ASSERT_EQ(topts.retval >> 16, 1, "test_run"); ASSERT_EQ(topts.retval & 0xffff, 1234 + 29, "test_run"); + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); ASSERT_EQ(skel->bss->test3_result, 1, "test3_result"); ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); + ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); + ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); + ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c index c40242dfa8fb..7772a0f288d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -46,6 +46,8 @@ static void test_function_entry(void) ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); ASSERT_EQ(skel->bss->test8_result, 1, "test8_result"); + ASSERT_EQ(skel->bss->test9_entry_result, 1, "test9_entry_result"); + ASSERT_EQ(skel->bss->test9_exit_result, 1, "test9_exit_result"); cleanup: get_func_ip_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3cea71f9c500..a539980a2fbe 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -253,6 +253,11 @@ static void subtest_css_iters(void) { "/cg1/cg2" }, { "/cg1/cg2/cg3" }, { "/cg1/cg2/cg3/cg4" }, + { "/cg1/cg5" }, + { "/cg1/cg5/cg6" }, + { "/cg1/cg7" }, + { "/cg1/cg7/cg8" }, + { "/cg1/cg7/cg8/cg9" }, }; int err, cg_nr = ARRAY_SIZE(cgs); int i; @@ -284,7 +289,8 @@ static void subtest_css_iters(void) ASSERT_EQ(skel->bss->post_order_cnt, cg_nr, "post_order_cnt"); ASSERT_EQ(skel->bss->last_cg_id, get_cgroup_id(cgs[0].path), "last_cg_id"); - ASSERT_EQ(skel->bss->tree_high, cg_nr - 1, "tree_high"); + ASSERT_EQ(skel->bss->children_cnt, 3, "children_cnt"); + ASSERT_EQ(skel->bss->tree_high, 3, "tree_high"); iters_css__detach(skel); cleanup: cleanup_cgroup_environment(); diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c new file mode 100644 index 000000000000..5e4793c9c29a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include "kfunc_implicit_args.skel.h" + +void test_kfunc_implicit_args(void) +{ + RUN_TESTS(kfunc_implicit_args); +} diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 6cfaa978bc9a..9caef222e528 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <errno.h> +#include <sys/prctl.h> #include <test_progs.h> #include "kprobe_multi.skel.h" #include "trace_helpers.h" @@ -540,6 +542,46 @@ cleanup: kprobe_multi_override__destroy(skel); } +static void test_override(void) +{ + struct kprobe_multi_override *skel = NULL; + int err; + + skel = kprobe_multi_override__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + skel->bss->pid = getpid(); + + /* no override */ + err = prctl(0xffff, 0); + ASSERT_EQ(err, -1, "err"); + + /* kprobe.multi override */ + skel->links.test_override = bpf_program__attach_kprobe_multi_opts(skel->progs.test_override, + SYS_PREFIX "sys_prctl", NULL); + if (!ASSERT_OK_PTR(skel->links.test_override, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + + bpf_link__destroy(skel->links.test_override); + skel->links.test_override = NULL; + + /* kprobe override */ + skel->links.test_kprobe_override = bpf_program__attach_kprobe(skel->progs.test_kprobe_override, + false, SYS_PREFIX "sys_prctl"); + if (!ASSERT_OK_PTR(skel->links.test_kprobe_override, "bpf_program__attach_kprobe")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + +cleanup: + kprobe_multi_override__destroy(skel); +} + #ifdef __x86_64__ static void test_attach_write_ctx(void) { @@ -597,6 +639,8 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("override")) + test_override(); if (test__start_subtest("session")) test_session_skel_api(); if (test__start_subtest("session_cookie")) diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 8743df599567..f372162c0280 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -131,6 +131,25 @@ static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu) return 0; } +static void wait_for_map_release(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, lopts); + struct map_kptr *skel; + int ret; + + skel = map_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load")) + return; + + do { + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &lopts); + ASSERT_OK(ret, "count_ref ret"); + ASSERT_OK(lopts.retval, "count_ref retval"); + } while (skel->bss->num_of_refs != 2); + + map_kptr__destroy(skel); +} + void serial_test_map_kptr(void) { struct rcu_tasks_trace_gp *skel; @@ -148,11 +167,15 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on bpf_map_free_deferred */ test_map_kptr_success(false); ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on synchronous delete elem */ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index 343da65864d6..a72ae0b29f6e 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include "cgroup_helpers.h" #include "percpu_alloc_array.skel.h" #include "percpu_alloc_cgrp_local_storage.skel.h" #include "percpu_alloc_fail.skel.h" @@ -115,6 +116,328 @@ static void test_failure(void) { RUN_TESTS(percpu_alloc_fail); } +static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t key_sz, u32 entries, + int nr_cpus, bool test_batch) +{ + size_t value_sz = sizeof(u32), value_sz_cpus, value_sz_total; + u32 *values = NULL, *values_percpu = NULL; + const u32 value = 0xDEADC0DE; + int i, j, cpu, map_fd, err; + u64 batch = 0, flags; + void *values_row; + u32 count, v; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + value_sz_cpus = value_sz * nr_cpus; + values = calloc(entries, value_sz_cpus); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + + values_percpu = calloc(entries, roundup(value_sz, 8) * nr_cpus); + if (!ASSERT_OK_PTR(values_percpu, "calloc values_percpu")) { + free(values); + return; + } + + value_sz_total = value_sz_cpus * entries; + memset(values, 0, value_sz_total); + + map_fd = bpf_map__fd(map); + flags = BPF_F_CPU | BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu|all_cpus")) + goto out; + + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem cpu|all_cpus")) + goto out; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags all_cpus")) + goto out; + + flags = BPF_F_LOCK | BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags BPF_F_LOCK")) + goto out; + + flags = BPF_F_LOCK | BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem BPF_F_LOCK")) + goto out; + + flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_elem -ERANGE")) + goto out; + + err = bpf_map__update_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__update_elem -ERANGE")) + goto out; + + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_lookup_elem_flags -ERANGE")) + goto out; + + err = bpf_map__lookup_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__lookup_elem -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + /* clear value on all cpus */ + values[0] = 0; + flags = BPF_F_ALL_CPUS; + for (i = 0; i < entries; i++) { + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem all_cpus")) + goto out; + } + + /* update value on specified cpu */ + for (i = 0; i < entries; i++) { + values[0] = value; + flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem specified cpu")) + goto out; + + /* lookup then check value on CPUs */ + for (j = 0; j < nr_cpus; j++) { + flags = (u64)j << 32 | BPF_F_CPU; + err = bpf_map__lookup_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__lookup_elem specified cpu")) + goto out; + if (!ASSERT_EQ(values[0], j != cpu ? 0 : value, + "bpf_map__lookup_elem value on specified cpu")) + goto out; + } + } + } + + if (!test_batch) + goto out; + + count = entries; + batch_opts.elem_flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_batch -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + memset(values, 0, value_sz_total); + + /* clear values across all CPUs */ + count = entries; + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) + goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; + + /* update values on specified CPU */ + for (i = 0; i < entries; i++) + values[i] = value; + + count = entries; + batch_opts.elem_flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) + goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; + + /* lookup values on specified CPU */ + batch = 0; + count = entries; + memset(values, 0, entries * value_sz); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) + goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; + + for (i = 0; i < entries; i++) + if (!ASSERT_EQ(values[i], value, + "bpf_map_lookup_batch value on specified cpu")) + goto out; + + /* lookup values from all CPUs */ + batch = 0; + count = entries; + batch_opts.elem_flags = 0; + memset(values_percpu, 0, roundup(value_sz, 8) * nr_cpus * entries); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values_percpu, &count, + &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) + goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; + + for (i = 0; i < entries; i++) { + values_row = (void *) values_percpu + + roundup(value_sz, 8) * i * nr_cpus; + for (j = 0; j < nr_cpus; j++) { + v = *(u32 *) (values_row + roundup(value_sz, 8) * j); + if (!ASSERT_EQ(v, j != cpu ? 0 : value, + "bpf_map_lookup_batch value all_cpus")) + goto out; + } + } + } + +out: + free(values_percpu); + free(values); +} + +static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) +{ + struct percpu_alloc_array *skel; + size_t key_sz = sizeof(int); + int *keys, nr_cpus, i, err; + struct bpf_map *map; + u32 max_entries; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + max_entries = nr_cpus * 2; + keys = calloc(max_entries, key_sz); + if (!ASSERT_OK_PTR(keys, "calloc keys")) + return; + + for (i = 0; i < max_entries; i++) + keys[i] = i; + + skel = percpu_alloc_array__open(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open")) { + free(keys); + return; + } + + map = skel->maps.percpu; + bpf_map__set_type(map, map_type); + bpf_map__set_max_entries(map, max_entries); + + err = percpu_alloc_array__load(skel); + if (!ASSERT_OK(err, "test_percpu_alloc__load")) + goto out; + + test_percpu_map_op_cpu_flag(map, keys, key_sz, nr_cpus, nr_cpus, true); +out: + percpu_alloc_array__destroy(skel); + free(keys); +} + +static void test_percpu_array_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_ARRAY); +} + +static void test_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_HASH); +} + +static void test_lru_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_LRU_PERCPU_HASH); +} + +static void test_percpu_cgroup_storage_cpu_flag(void) +{ + struct percpu_alloc_array *skel = NULL; + struct bpf_cgroup_storage_key key; + int cgroup, prog_fd, nr_cpus, err; + struct bpf_map *map; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + return; + + cgroup = create_and_get_cgroup("/cg_percpu"); + if (!ASSERT_GE(cgroup, 0, "create_and_get_cgroup")) { + cleanup_cgroup_environment(); + return; + } + + err = join_cgroup("/cg_percpu"); + if (!ASSERT_OK(err, "join_cgroup")) + goto out; + + skel = percpu_alloc_array__open_and_load(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.cgroup_egress); + err = bpf_prog_attach(prog_fd, cgroup, BPF_CGROUP_INET_EGRESS, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + map = skel->maps.percpu_cgroup_storage; + err = bpf_map_get_next_key(bpf_map__fd(map), NULL, &key); + if (!ASSERT_OK(err, "bpf_map_get_next_key")) + goto out; + + test_percpu_map_op_cpu_flag(map, &key, sizeof(key), 1, nr_cpus, false); +out: + bpf_prog_detach2(-1, cgroup, BPF_CGROUP_INET_EGRESS); + close(cgroup); + cleanup_cgroup_environment(); + percpu_alloc_array__destroy(skel); +} + +static void test_map_op_cpu_flag(enum bpf_map_type map_type) +{ + u32 max_entries = 1, count = max_entries; + u64 flags, batch = 0, val = 0; + int err, map_fd, key = 0; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + map_fd = bpf_map_create(map_type, "test_cpu_flag", sizeof(int), sizeof(u64), max_entries, + NULL); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) + return; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_update_elem all_cpus"); + + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_update_batch all_cpus"); + + flags = BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu"); + + batch_opts.elem_flags = BPF_F_CPU; + err = bpf_map_lookup_batch(map_fd, NULL, &batch, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_lookup_batch cpu"); + + close(map_fd); +} + +static void test_array_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_ARRAY); +} + +static void test_hash_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_HASH); +} + void test_percpu_alloc(void) { if (test__start_subtest("array")) @@ -125,4 +448,16 @@ void test_percpu_alloc(void) test_cgrp_local_storage(); if (test__start_subtest("failure_tests")) test_failure(); + if (test__start_subtest("cpu_flag_percpu_array")) + test_percpu_array_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_hash")) + test_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_lru_percpu_hash")) + test_lru_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_cgroup_storage")) + test_percpu_cgroup_storage_cpu_flag(); + if (test__start_subtest("cpu_flag_array")) + test_array_cpu_flag(); + if (test__start_subtest("cpu_flag_hash")) + test_hash_cpu_flag(); } diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index 51544372f52e..41dfaaabb73f 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -101,9 +101,9 @@ static int resolve_symbols(void) int type_id; __u32 nr; - btf = btf__parse_elf("btf_data.bpf.o", NULL); + btf = btf__parse_raw("resolve_btfids.test.o.BTF"); if (CHECK(libbpf_get_error(btf), "resolve", - "Failed to load BTF from btf_data.bpf.o\n")) + "Failed to load BTF from resolve_btfids.test.o.BTF\n")) return -1; nr = btf__type_cnt(btf); diff --git a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c index e4940583924b..e2c867fd5244 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c @@ -5,9 +5,14 @@ #include "sk_bypass_prot_mem.skel.h" #include "network_helpers.h" +#ifndef PAGE_SIZE +#include <unistd.h> +#define PAGE_SIZE getpagesize() +#endif + #define NR_PAGES 32 #define NR_SOCKETS 2 -#define BUF_TOTAL (NR_PAGES * 4096 / NR_SOCKETS) +#define BUF_TOTAL (NR_PAGES * PAGE_SIZE / NR_SOCKETS) #define BUF_SINGLE 1024 #define NR_SEND (BUF_TOTAL / BUF_SINGLE) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1e3e4392dcca..256707e7d20d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include <error.h> -#include <netinet/tcp.h> +#include <linux/tcp.h> +#include <linux/socket.h> #include <sys/epoll.h> #include "test_progs.h" @@ -22,6 +23,15 @@ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ +/** + * SOL_TCP is defined in <netinet/tcp.h> (glibc), but the copybuf_address + * field of tcp_zerocopy_receive is not yet included in older versions. + * This workaround remains necessary until the glibc update propagates. + */ +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + static int connected_socket_v4(void) { struct sockaddr_in addr = { @@ -536,13 +546,14 @@ out: } -static void test_sockmap_skb_verdict_fionread(bool pass_prog) +static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog) { int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; + int split_len = sizeof(buf) / 2; if (pass_prog) { pass = test_sockmap_pass_prog__open_and_load(); @@ -550,7 +561,10 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) return; verdict = bpf_program__fd(pass->progs.prog_skb_verdict); map = bpf_map__fd(pass->maps.sock_map_rx); - expected = sizeof(buf); + if (sotype == SOCK_DGRAM) + expected = split_len; /* FIONREAD for UDP is different from TCP */ + else + expected = sizeof(buf); } else { drop = test_sockmap_drop_prog__open_and_load(); if (!ASSERT_OK_PTR(drop, "open_and_load")) @@ -566,7 +580,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; @@ -574,8 +588,9 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) goto out_close; - sent = xsend(p1, &buf, sizeof(buf), 0); - ASSERT_EQ(sent, sizeof(buf), "xsend(p0)"); + sent = xsend(p1, &buf, split_len, 0); + sent += xsend(p1, &buf, sizeof(buf) - split_len, 0); + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); err = ioctl(c1, FIONREAD, &avail); ASSERT_OK(err, "ioctl(FIONREAD) error"); ASSERT_EQ(avail, expected, "ioctl(FIONREAD)"); @@ -597,6 +612,12 @@ out: test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_fionread(bool pass_prog) +{ + do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog); + do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog); +} + static void test_sockmap_skb_verdict_change_tail(void) { struct test_sockmap_change_tail *skel; @@ -1042,6 +1063,257 @@ close_map: xclose(map); } +/* it is used to reproduce WARNING */ +static void test_sockmap_zc(void) +{ + int map, err, sent, recvd, zero = 0, one = 1, on = 1; + char buf[10] = "0123456789", rcv[11], addr[100]; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + struct tcp_zerocopy_receive zc; + socklen_t zc_len = sizeof(zc); + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend")) + goto end; + + /* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */ + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)")) + goto end; + + /* uninstall sockmap of p1 */ + bpf_map_delete_elem(map, &one); + + /* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */ + sent = xsend(c1, buf, sizeof(buf) - 1, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend")) + goto end; + + err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on)); + if (!ASSERT_OK(err, "setsockopt")) + goto end; + + memset(&zc, 0, sizeof(zc)); + zc.copybuf_address = (__u64)((unsigned long)addr); + zc.copybuf_len = sizeof(addr); + + err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); + if (!ASSERT_OK(err, "getsockopt")) + goto end; + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* it is used to check whether copied_seq of sk is correct */ +static void test_sockmap_copied_seq(bool strp) +{ + int i, map, err, sent, recvd, zero = 0, one = 1; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + if (strp) { + prog = skel->progs.prog_skb_verdict_ingress_strp; + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0); + if (!ASSERT_OK(err, "bpf_prog_attach parser")) + goto end; + } + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) + goto end; + + /* just trigger sockamp: data sent by c0 will be received by p1 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf")) + goto end; + + /* do partial read */ + recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1); + recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; + + /* uninstall sockmap of p1 and p0 */ + err = bpf_map_delete_elem(map, &one); + if (!ASSERT_OK(err, "bpf_map_delete_elem(1)")) + goto end; + + err = bpf_map_delete_elem(map, &zero); + if (!ASSERT_OK(err, "bpf_map_delete_elem(0)")) + goto end; + + /* now all sockets become plain socket, they should still work */ + for (i = 0; i < 5; i++) { + /* test copied_seq of p1 by running tcp native stack */ + sent = xsend(c1, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native")) + goto end; + + recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native")) + goto end; + + /* p0 previously redirected skb to p1, we also check copied_seq of p0 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native")) + goto end; + + recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native")) + goto end; + } + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* Wait until FIONREAD returns the expected value or timeout */ +static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int avail = 0; + + while (elapsed < timeout_ms) { + if (ioctl(fd, FIONREAD, &avail) < 0) + return -errno; + if (avail >= expected) + return avail; + usleep(1000); + elapsed++; + } + return avail; +} + +/* it is used to send data to via native stack and BPF redirecting */ +static void test_sockmap_multi_channels(int sotype) +{ + int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); + if (err) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + /* send data to p1 via native stack */ + sent = xsend(c1, buf, 2, 0); + if (!ASSERT_EQ(sent, 2, "xsend(2)")) + goto end; + + avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return"); + + /* send data to p1 via bpf redirecting */ + sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)")) + goto end; + + /* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable + * here since it may return immediately if prior data is already queued. + */ + expected = sotype == SOCK_DGRAM ? 2 : sizeof(buf); + avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return"); + + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -1108,4 +1380,14 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_vsock_poll(); if (test__start_subtest("sockmap vsock unconnected")) test_sockmap_vsock_unconnected(); + if (test__start_subtest("sockmap with zc")) + test_sockmap_zc(); + if (test__start_subtest("sockmap recover")) + test_sockmap_copied_seq(false); + if (test__start_subtest("sockmap recover with strp")) + test_sockmap_copied_seq(true); + if (test__start_subtest("sockmap tcp multi channels")) + test_sockmap_multi_channels(SOCK_STREAM); + if (test__start_subtest("sockmap udp multi channels")) + test_sockmap_multi_channels(SOCK_DGRAM); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c9efdd2a5b18..da42b00e3d1f 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -74,11 +74,20 @@ static void test_stacktrace_ips_kprobe_multi(bool retprobe) load_kallsyms(); - check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, - ksym_get_addr("bpf_testmod_stacktrace_test_3"), - ksym_get_addr("bpf_testmod_stacktrace_test_2"), - ksym_get_addr("bpf_testmod_stacktrace_test_1"), - ksym_get_addr("bpf_testmod_test_read")); + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } cleanup: stacktrace_ips__destroy(skel); @@ -128,6 +137,99 @@ cleanup: stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_kprobe(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_test = bpf_program__attach_kprobe_opts( + skel->progs.kprobe_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_test, "bpf_program__attach_kprobe_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void test_stacktrace_ips_trampoline(bool retprobe) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + if (retprobe) { + skel->links.fexit_test = bpf_program__attach_trace(skel->progs.fexit_test); + if (!ASSERT_OK_PTR(skel->links.fexit_test, "bpf_program__attach_trace")) + goto cleanup; + } else { + skel->links.fentry_test = bpf_program__attach_trace(skel->progs.fentry_test); + if (!ASSERT_OK_PTR(skel->links.fentry_test, "bpf_program__attach_trace")) + goto cleanup; + } + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -136,6 +238,14 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe_multi(true); if (test__start_subtest("raw_tp")) test_stacktrace_ips_raw_tp(); + if (test__start_subtest("kprobe")) + test_stacktrace_ips_kprobe(false); + if (test__start_subtest("kretprobe")) + test_stacktrace_ips_kprobe(true); + if (test__start_subtest("fentry")) + test_stacktrace_ips_trampoline(false); + if (test__start_subtest("fexit")) + test_stacktrace_ips_trampoline(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 0f3bf594e7a5..300032a19445 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -9,6 +9,7 @@ static const char * const test_cases[] = { "strcmp", "strcasecmp", + "strncasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 0ab36503c3b2..7d534fde0af9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -8,6 +8,7 @@ #include "tailcall_freplace.skel.h" #include "tc_bpf2bpf.skel.h" #include "tailcall_fail.skel.h" +#include "tailcall_sleepable.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1653,6 +1654,77 @@ static void test_tailcall_failure() RUN_TESTS(tailcall_fail); } +noinline void uprobe_sleepable_trigger(void) +{ + asm volatile (""); +} + +static void test_tailcall_sleepable(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct tailcall_sleepable *skel; + int prog_fd, map_fd; + int err, key; + + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + /* + * Test that we can't load uprobe_normal and uprobe_sleepable_1, + * because they share tailcall map. + */ + bpf_program__set_autoload(skel->progs.uprobe_normal, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_ERR(err, "tailcall_sleepable__load")) + goto out; + + tailcall_sleepable__destroy(skel); + + /* + * Test that we can tail call from sleepable to sleepable program. + */ + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_2, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_OK(err, "tailcall_sleepable__load")) + goto out; + + /* Add sleepable uprobe_sleepable_2 to jmp_table[0]. */ + key = 0; + prog_fd = bpf_program__fd(skel->progs.uprobe_sleepable_2); + map_fd = bpf_map__fd(skel->maps.jmp_table); + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + skel->bss->my_pid = getpid(); + + /* Attach uprobe_sleepable_1 to uprobe_sleepable_trigger and hit it. */ + opts.func_name = "uprobe_sleepable_trigger"; + skel->links.uprobe_sleepable_1 = bpf_program__attach_uprobe_opts( + skel->progs.uprobe_sleepable_1, + -1, + "/proc/self/exe", + 0 /* offset */, + &opts); + if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable_1, "bpf_program__attach_uprobe_opts")) + goto out; + + uprobe_sleepable_trigger(); + ASSERT_EQ(skel->bss->executed, 1, "executed"); + +out: + tailcall_sleepable__destroy(skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1707,4 +1779,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_freplace(); if (test__start_subtest("tailcall_failure")) test_tailcall_failure(); + if (test__start_subtest("tailcall_sleepable")) + test_tailcall_sleepable(); } diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 2de38776a2d4..0f86b9275cf9 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -94,7 +94,7 @@ struct tld_metadata { }; struct tld_meta_u { - _Atomic __u8 cnt; + _Atomic __u16 cnt; __u16 size; struct tld_metadata metadata[]; }; @@ -217,7 +217,7 @@ out: static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data) { int err, i, sz, off = 0; - __u8 cnt; + __u16 cnt; if (!TLD_READ_ONCE(tld_meta_p)) { err = __tld_init_meta_p(); diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index 42e822ea352f..7bee33797c71 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -112,24 +112,24 @@ static void test_recursion(void) task_ls_recursion__detach(skel); /* Refer to the comment in BPF_PROG(on_update) for - * the explanation on the value 201 and 100. + * the explanation on the value 200 and 1. */ map_fd = bpf_map__fd(skel->maps.map_a); err = bpf_map_lookup_elem(map_fd, &task_fd, &value); ASSERT_OK(err, "lookup map_a"); - ASSERT_EQ(value, 201, "map_a value"); - ASSERT_EQ(skel->bss->nr_del_errs, 1, "bpf_task_storage_delete busy"); + ASSERT_EQ(value, 200, "map_a value"); + ASSERT_EQ(skel->bss->nr_del_errs, 0, "bpf_task_storage_delete busy"); map_fd = bpf_map__fd(skel->maps.map_b); err = bpf_map_lookup_elem(map_fd, &task_fd, &value); ASSERT_OK(err, "lookup map_b"); - ASSERT_EQ(value, 100, "map_b value"); + ASSERT_EQ(value, 1, "map_b value"); prog_fd = bpf_program__fd(skel->progs.on_update); memset(&info, 0, sizeof(info)); err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); ASSERT_OK(err, "get prog info"); - ASSERT_EQ(info.recursion_misses, 0, "on_update prog recursion"); + ASSERT_EQ(info.recursion_misses, 2, "on_update prog recursion"); prog_fd = bpf_program__fd(skel->progs.on_enter); memset(&info, 0, sizeof(info)); diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c new file mode 100644 index 000000000000..461ded722351 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "struct_ops_assoc.skel.h" +#include "struct_ops_assoc_reuse.skel.h" +#include "struct_ops_assoc_in_timer.skel.h" + +static void test_st_ops_assoc(void) +{ + struct struct_ops_assoc *skel = NULL; + int err, pid; + + skel = struct_ops_assoc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc__open")) + goto out; + + /* cannot explicitly associate struct_ops program */ + err = bpf_program__assoc_struct_ops(skel->progs.test_1_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(test_1_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_b, st_ops_map_b)"); + + /* sys_enter_prog_a already associated with map_a */ + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_b, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_b)"); + + err = struct_ops_assoc__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel->bss->test_pid = pid; + sys_gettid(); + skel->bss->test_pid = 0; + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc__destroy(skel); +} + +static void test_st_ops_assoc_reuse(void) +{ + struct struct_ops_assoc_reuse *skel = NULL; + int err; + + skel = struct_ops_assoc_reuse__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_reuse__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = struct_ops_assoc_reuse__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc_reuse__destroy(skel); +} + +static void test_st_ops_assoc_in_timer(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + err = struct_ops_assoc_in_timer__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again + * immediately. + */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, 1234, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + +static void test_st_ops_assoc_in_timer_no_uref(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + struct bpf_link *link; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + link = bpf_map__attach_struct_ops(skel->maps.st_ops_map); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again. + * timer_cb will run 500ms after syscall_prog runs, when the user space no longer + * holds a reference to st_ops_map. + */ + skel->bss->timer_ns = 500000000; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Detach and close struct_ops map to cause it to be freed */ + bpf_link__destroy(link); + close(bpf_program__fd(skel->progs.syscall_prog)); + close(bpf_map__fd(skel->maps.st_ops_map)); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, -1, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + +void test_struct_ops_assoc(void) +{ + if (test__start_subtest("st_ops_assoc")) + test_st_ops_assoc(); + if (test__start_subtest("st_ops_assoc_reuse")) + test_st_ops_assoc_reuse(); + if (test__start_subtest("st_ops_assoc_in_timer")) + test_st_ops_assoc_in_timer(); + if (test__start_subtest("st_ops_assoc_in_timer_no_uref")) + test_st_ops_assoc_in_timer_no_uref(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 9fd6306b455c..9556ad3d986f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -4,7 +4,7 @@ #include <test_progs.h> #define TLD_FREE_DATA_ON_THREAD_EXIT -#define TLD_DYN_DATA_SIZE 4096 +#define TLD_DYN_DATA_SIZE (getpagesize() - 8) #include "task_local_data.h" struct test_tld_struct { diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index 5af28f359cfd..bab4a31621c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -433,7 +433,7 @@ static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pk } /* Search for the end of the packet in verbatim mode */ - if (!pkt_continues(pkt->options)) + if (!pkt_continues(pkt->options) || !pkt->valid) return nb_frags; next_frag = pkt_stream->current_pkt_nb; @@ -1090,6 +1090,8 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) xsk_ring_prod__cancel(&umem->fq, nb_frags); } frags_processed -= nb_frags; + pkt_stream_cancel(pkt_stream); + pkts_sent--; } if (ifobj->use_fill_ring) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 34f9ccce2602..09ff21e1ad2f 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -1,12 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ +#include <sched.h> #include <test_progs.h> +#include <linux/perf_event.h> +#include <sys/syscall.h> #include "timer.skel.h" #include "timer_failure.skel.h" #include "timer_interrupt.skel.h" #define NUM_THR 8 +static int perf_event_open(__u32 type, __u64 config, int pid, int cpu) +{ + struct perf_event_attr attr = { + .type = type, + .config = config, + .size = sizeof(struct perf_event_attr), + .sample_period = 10000, + }; + + return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0); +} + static void *spin_lock_thread(void *arg) { int i, err, prog_fd = *(int *)arg; @@ -22,13 +37,174 @@ static void *spin_lock_thread(void *arg) pthread_exit(arg); } -static int timer(struct timer *timer_skel) + +static int timer_stress_runner(struct timer *timer_skel, bool async_cancel) { - int i, err, prog_fd; + int i, err = 1, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); pthread_t thread_id[NUM_THR]; void *ret; + timer_skel->bss->async_cancel = async_cancel; + prog_fd = bpf_program__fd(timer_skel->progs.race); + for (i = 0; i < NUM_THR; i++) { + err = pthread_create(&thread_id[i], NULL, + &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + break; + } + + while (i) { + err = pthread_join(thread_id[--i], &ret); + if (ASSERT_OK(err, "pthread_join")) + ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); + } + return err; +} + +static int timer_stress(struct timer *timer_skel) +{ + return timer_stress_runner(timer_skel, false); +} + +static int timer_stress_async_cancel(struct timer *timer_skel) +{ + return timer_stress_runner(timer_skel, true); +} + +static void *nmi_cpu_worker(void *arg) +{ + volatile __u64 num = 1; + int i; + + for (i = 0; i < 500000000; ++i) + num *= (i % 7) + 1; + (void)num; + + return NULL; +} + +static int run_nmi_test(struct timer *timer_skel, struct bpf_program *prog) +{ + struct bpf_link *link = NULL; + int pe_fd = -1, pipefd[2] = {-1, -1}, pid = 0, status; + char buf = 0; + int ret = -1; + + if (!ASSERT_OK(pipe(pipefd), "pipe")) + goto cleanup; + + pid = fork(); + if (pid == 0) { + /* Child: spawn multiple threads to consume multiple CPUs */ + pthread_t threads[NUM_THR]; + int i; + + close(pipefd[1]); + read(pipefd[0], &buf, 1); + close(pipefd[0]); + + for (i = 0; i < NUM_THR; i++) + pthread_create(&threads[i], NULL, nmi_cpu_worker, NULL); + for (i = 0; i < NUM_THR; i++) + pthread_join(threads[i], NULL); + exit(0); + } + + if (!ASSERT_GE(pid, 0, "fork")) + goto cleanup; + + /* Open perf event for child process across all CPUs */ + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, + PERF_COUNT_HW_CPU_CYCLES, + pid, /* measure child process */ + -1); /* on any CPU */ + if (pe_fd < 0) { + if (errno == ENOENT || errno == EOPNOTSUPP) { + printf("SKIP:no PERF_COUNT_HW_CPU_CYCLES\n"); + test__skip(); + ret = EOPNOTSUPP; + goto cleanup; + } + ASSERT_GE(pe_fd, 0, "perf_event_open"); + goto cleanup; + } + + link = bpf_program__attach_perf_event(prog, pe_fd); + if (!ASSERT_OK_PTR(link, "attach_perf_event")) + goto cleanup; + pe_fd = -1; /* Ownership transferred to link */ + + /* Signal child to start CPU work */ + close(pipefd[0]); + pipefd[0] = -1; + write(pipefd[1], &buf, 1); + close(pipefd[1]); + pipefd[1] = -1; + + waitpid(pid, &status, 0); + pid = 0; + + /* Verify NMI context was hit */ + ASSERT_GT(timer_skel->bss->test_hits, 0, "test_hits"); + ret = 0; + +cleanup: + bpf_link__destroy(link); + if (pe_fd >= 0) + close(pe_fd); + if (pid > 0) { + write(pipefd[1], &buf, 1); + waitpid(pid, &status, 0); + } + if (pipefd[0] >= 0) + close(pipefd[0]); + if (pipefd[1] >= 0) + close(pipefd[1]); + return ret; +} + +static int timer_stress_nmi_race(struct timer *timer_skel) +{ + int err; + + err = run_nmi_test(timer_skel, timer_skel->progs.nmi_race); + if (err == EOPNOTSUPP) + return 0; + return err; +} + +static int timer_stress_nmi_update(struct timer *timer_skel) +{ + int err; + + err = run_nmi_test(timer_skel, timer_skel->progs.nmi_update); + if (err == EOPNOTSUPP) + return 0; + if (err) + return err; + ASSERT_GT(timer_skel->bss->update_hits, 0, "update_hits"); + return 0; +} + +static int timer_stress_nmi_cancel(struct timer *timer_skel) +{ + int err; + + err = run_nmi_test(timer_skel, timer_skel->progs.nmi_cancel); + if (err == EOPNOTSUPP) + return 0; + if (err) + return err; + ASSERT_GT(timer_skel->bss->cancel_hits, 0, "cancel_hits"); + return 0; +} + +static int timer(struct timer *timer_skel) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); + err = timer__attach(timer_skel); if (!ASSERT_OK(err, "timer_attach")) return err; @@ -63,25 +239,30 @@ static int timer(struct timer *timer_skel) /* check that code paths completed */ ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); - prog_fd = bpf_program__fd(timer_skel->progs.race); - for (i = 0; i < NUM_THR; i++) { - err = pthread_create(&thread_id[i], NULL, - &spin_lock_thread, &prog_fd); - if (!ASSERT_OK(err, "pthread_create")) - break; - } + return 0; +} - while (i) { - err = pthread_join(thread_id[--i], &ret); - if (ASSERT_OK(err, "pthread_join")) - ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); - } +static int timer_cancel_async(struct timer *timer_skel) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = bpf_program__fd(timer_skel->progs.test_async_cancel_succeed); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(500); + /* check that there were no errors in timer execution */ + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); return 0; } -/* TODO: use pid filtering */ -void serial_test_timer(void) +static void test_timer(int (*timer_test_fn)(struct timer *timer_skel)) { struct timer *timer_skel = NULL; int err; @@ -94,13 +275,48 @@ void serial_test_timer(void) if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; - err = timer(timer_skel); + err = timer_test_fn(timer_skel); ASSERT_OK(err, "timer"); timer__destroy(timer_skel); +} + +void serial_test_timer(void) +{ + test_timer(timer); RUN_TESTS(timer_failure); } +void serial_test_timer_stress(void) +{ + test_timer(timer_stress); +} + +void serial_test_timer_stress_async_cancel(void) +{ + test_timer(timer_stress_async_cancel); +} + +void serial_test_timer_async_cancel(void) +{ + test_timer(timer_cancel_async); +} + +void serial_test_timer_stress_nmi_race(void) +{ + test_timer(timer_stress_nmi_race); +} + +void serial_test_timer_stress_nmi_update(void) +{ + test_timer(timer_stress_nmi_update); +} + +void serial_test_timer_stress_nmi_cancel(void) +{ + test_timer(timer_stress_nmi_cancel); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c new file mode 100644 index 000000000000..9f1f9aec8888 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> +#include "timer_start_deadlock.skel.h" + +void test_timer_start_deadlock(void) +{ + struct timer_start_deadlock *skel; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + skel = timer_start_deadlock__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = timer_start_deadlock__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.start_timer); + + /* + * Run the syscall program that attempts to deadlock. + * If the kernel deadlocks, this call will never return. + */ + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "prog_test_run"); + ASSERT_EQ(opts.retval, 0, "prog_retval"); + + ASSERT_EQ(skel->bss->tp_called, 1, "tp_called"); +cleanup: + timer_start_deadlock__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c b/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c new file mode 100644 index 000000000000..29a46e96f660 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#define _GNU_SOURCE +#include <sched.h> +#include <pthread.h> +#include <test_progs.h> +#include "timer_start_delete_race.skel.h" + +/* + * Test for race between bpf_timer_start() and map element deletion. + * + * The race scenario: + * - CPU 1: bpf_timer_start() proceeds to bpf_async_process() and is about + * to call hrtimer_start() but hasn't yet + * - CPU 2: map_delete_elem() calls __bpf_async_cancel_and_free(), since + * timer is not scheduled yet hrtimer_try_to_cancel() is a nop, + * then calls bpf_async_refcount_put() dropping refcnt to zero + * and scheduling call_rcu_tasks_trace() + * - CPU 1: continues and calls hrtimer_start() + * - After RCU tasks trace grace period: memory is freed + * - Timer callback fires on freed memory: UAF! + * + * This test stresses this race by having two threads: + * - Thread 1: repeatedly starts timers + * - Thread 2: repeatedly deletes map elements + * + * KASAN should detect use-after-free. + */ + +#define ITERATIONS 1000 + +struct ctx { + struct timer_start_delete_race *skel; + volatile bool start; + volatile bool stop; + int errors; +}; + +static void *start_timer_thread(void *arg) +{ + struct ctx *ctx = arg; + cpu_set_t cpuset; + int fd, i; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + + while (!ctx->start && !ctx->stop) + usleep(1); + if (ctx->stop) + return NULL; + + fd = bpf_program__fd(ctx->skel->progs.start_timer); + + for (i = 0; i < ITERATIONS && !ctx->stop; i++) { + LIBBPF_OPTS(bpf_test_run_opts, opts); + int err; + + err = bpf_prog_test_run_opts(fd, &opts); + if (err || opts.retval) { + ctx->errors++; + break; + } + } + + return NULL; +} + +static void *delete_elem_thread(void *arg) +{ + struct ctx *ctx = arg; + cpu_set_t cpuset; + int fd, i; + + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + + while (!ctx->start && !ctx->stop) + usleep(1); + if (ctx->stop) + return NULL; + + fd = bpf_program__fd(ctx->skel->progs.delete_elem); + + for (i = 0; i < ITERATIONS && !ctx->stop; i++) { + LIBBPF_OPTS(bpf_test_run_opts, opts); + int err; + + err = bpf_prog_test_run_opts(fd, &opts); + if (err || opts.retval) { + ctx->errors++; + break; + } + } + + return NULL; +} + +void test_timer_start_delete_race(void) +{ + struct timer_start_delete_race *skel; + pthread_t threads[2]; + struct ctx ctx = {}; + int err; + + skel = timer_start_delete_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + ctx.skel = skel; + + err = pthread_create(&threads[0], NULL, start_timer_thread, &ctx); + if (!ASSERT_OK(err, "create start_timer_thread")) { + ctx.stop = true; + goto cleanup; + } + + err = pthread_create(&threads[1], NULL, delete_elem_thread, &ctx); + if (!ASSERT_OK(err, "create delete_elem_thread")) { + ctx.stop = true; + pthread_join(threads[0], NULL); + goto cleanup; + } + + ctx.start = true; + + pthread_join(threads[0], NULL); + pthread_join(threads[1], NULL); + + ASSERT_EQ(ctx.errors, 0, "thread_errors"); + + /* Either KASAN will catch UAF or kernel will crash or nothing happens */ +cleanup: + timer_start_delete_race__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 4b4b081b46cc..302286a80154 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -6,6 +6,8 @@ #include "verifier_and.skel.h" #include "verifier_arena.skel.h" #include "verifier_arena_large.skel.h" +#include "verifier_arena_globals1.skel.h" +#include "verifier_arena_globals2.skel.h" #include "verifier_array_access.skel.h" #include "verifier_async_cb_context.skel.h" #include "verifier_basic_stack.skel.h" @@ -28,9 +30,11 @@ #include "verifier_ctx.skel.h" #include "verifier_ctx_sk_msg.skel.h" #include "verifier_d_path.skel.h" +#include "verifier_default_trusted_ptr.skel.h" #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" +#include "verifier_div_mod_bounds.skel.h" #include "verifier_div_overflow.skel.h" #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" @@ -108,6 +112,7 @@ #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" #include "verifier_lsm.skel.h" +#include "verifier_jit_inline.skel.h" #include "irq.skel.h" #define MAX_ENTRIES 11 @@ -147,6 +152,8 @@ static void run_tests_aux(const char *skel_name, void test_verifier_and(void) { RUN(verifier_and); } void test_verifier_arena(void) { RUN(verifier_arena); } void test_verifier_arena_large(void) { RUN(verifier_arena_large); } +void test_verifier_arena_globals1(void) { RUN(verifier_arena_globals1); } +void test_verifier_arena_globals2(void) { RUN(verifier_arena_globals2); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } @@ -167,9 +174,11 @@ void test_verifier_const_or(void) { RUN(verifier_const_or); } void test_verifier_ctx(void) { RUN(verifier_ctx); } void test_verifier_ctx_sk_msg(void) { RUN(verifier_ctx_sk_msg); } void test_verifier_d_path(void) { RUN(verifier_d_path); } +void test_verifier_default_trusted_ptr(void) { RUN_TESTS(verifier_default_trusted_ptr); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } +void test_verifier_div_mod_bounds(void) { RUN(verifier_div_mod_bounds); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } @@ -247,6 +256,7 @@ void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } +void test_verifier_jit_inline(void) { RUN(verifier_jit_inline); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 15c67d23128b..84831eecc935 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -16,12 +16,12 @@ void serial_test_wq(void) /* re-run the success test to check if the timer was actually executed */ wq_skel = wq__open_and_load(); - if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + if (!ASSERT_OK_PTR(wq_skel, "wq__open_and_load")) return; err = wq__attach(wq_skel); if (!ASSERT_OK(err, "wq_attach")) - return; + goto clean_up; prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); err = bpf_prog_test_run_opts(prog_fd, &topts); @@ -31,6 +31,7 @@ void serial_test_wq(void) usleep(50); /* 10 usecs should be enough, but give it extra */ ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); +clean_up: wq__destroy(wq_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index df27535995af..ad56e4370ce3 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -18,7 +18,7 @@ static void test_xdp_with_cpumap_helpers(void) struct bpf_cpumap_val val = { .qsize = 192, }; - int err, prog_fd, prog_redir_fd, map_fd; + int err, prog_fd, prog_redir_fd, map_fd, bad_fd; struct nstoken *nstoken = NULL; __u32 idx = 0; @@ -79,7 +79,22 @@ static void test_xdp_with_cpumap_helpers(void) val.qsize = 192; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + ASSERT_EQ(err, -EINVAL, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + + /* Try to attach non-BPF file descriptor */ + bad_fd = open("/dev/null", O_RDONLY); + ASSERT_GE(bad_fd, 0, "Open /dev/null for non-BPF fd"); + + val.bpf_prog.fd = bad_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EINVAL, "Add non-BPF fd to cpumap entry"); + + /* Try to attach nonexistent file descriptor */ + err = close(bad_fd); + ASSERT_EQ(err, 0, "Close non-BPF fd for nonexistent fd"); + + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EBADF, "Add nonexistent fd to cpumap entry"); /* Try to attach BPF_XDP program with frags to cpumap when we have * already loaded a BPF_XDP program on the map diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c index efa350d04ec5..910dabe95afd 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -114,12 +114,14 @@ static void test_xdp_pull_data_basic(void) { u32 pg_sz, max_meta_len, max_data_len; struct test_xdp_pull_data *skel; + int buff_len; skel = test_xdp_pull_data__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) return; pg_sz = sysconf(_SC_PAGE_SIZE); + buff_len = pg_sz + pg_sz / 2; if (find_xdp_sizes(skel, pg_sz)) goto out; @@ -140,13 +142,13 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); /* multi-buf pkt, empty linear data area, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, 0, PULL_MAX); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX); /* multi-buf pkt, no tailroom, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, max_data_len, PULL_MAX); /* Test cases with invalid pull length */ @@ -154,18 +156,18 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); /* multi-buf pkt with no space left in linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, max_data_len, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, empty linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, 0, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no tailroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, max_data_len, PULL_MAX | PULL_PLUS_ONE); out: diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index 3a2ddcacbea6..235d8cc95bdd 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -30,6 +30,7 @@ struct arena_list_head __arena *list_head; int list_sum; int cnt; bool skip = false; +const volatile bool nonsleepable = false; #ifdef __BPF_FEATURE_ADDR_SPACE_CAST long __arena arena_sum; @@ -42,6 +43,9 @@ int test_val SEC(".addr_space.1"); int zero; +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + SEC("syscall") int arena_list_add(void *ctx) { @@ -71,6 +75,10 @@ int arena_list_del(void *ctx) struct elem __arena *n; int sum = 0; + /* Take rcu_read_lock to test non-sleepable context */ + if (nonsleepable) + bpf_rcu_read_lock(); + arena_sum = 0; list_for_each_entry(n, list_head, node) { sum += n->value; @@ -79,6 +87,9 @@ int arena_list_del(void *ctx) bpf_free(n); } list_sum = sum; + + if (nonsleepable) + bpf_rcu_read_unlock(); #else skip = true; #endif diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index ff189a736ad8..8fc38592a87b 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -62,9 +62,9 @@ static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending) &init, BPF_NOEXIST); } -SEC("fentry/cgroup_attach_task") -int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup) +SEC("tp_btf/cgroup_attach_task") +int BPF_PROG(counter, struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup) { __u64 cg_id = cgroup_id(dst_cgrp); struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem( diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c new file mode 100644 index 000000000000..59fb70a3cc50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_core_read.h> +#include "cgroup_iter_memcg.h" + +char _license[] SEC("license") = "GPL"; + +/* The latest values read are stored here. */ +struct memcg_query memcg_query SEC(".data.query"); + +SEC("iter.s/cgroup") +int cgroup_memcg_query(struct bpf_iter__cgroup *ctx) +{ + struct cgroup *cgrp = ctx->cgroup; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + if (!cgrp) + return 1; + + css = &cgrp->self; + memcg = bpf_get_mem_cgroup(css); + if (!memcg) + return 1; + + bpf_mem_cgroup_flush_stats(memcg); + + memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED); + memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM); + memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED); + memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM); + memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT); + + bpf_put_mem_cgroup(memcg); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index 6884ab99a421..f05e120f3450 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -431,6 +431,47 @@ __naked void subprog1(void) ::: __clobber_all); } +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +SEC("socket") +__log_level(2) +__msg("2: .1........ (07) r1 += 8") +__msg("3: .1........ (79) r2 = *(u64 *)(r1 +0)") +__msg("4: ..2....... (b7) r3 = 1") +__msg("5: ..23...... (b7) r4 = 2") +__msg("6: ..234..... (0d) gotox r2") +__msg("7: ...3...... (bf) r0 = r3") +__msg("8: 0......... (95) exit") +__msg("9: ....4..... (bf) r0 = r4") +__msg("10: 0......... (95) exit") +__naked +void gotox(void) +{ + asm volatile ( + ".pushsection .jumptables,\"\",@progbits;" +"jt0_%=: .quad l0_%= - socket;" + ".quad l1_%= - socket;" + ".size jt0_%=, 16;" + ".global jt0_%=;" + ".popsection;" + + "r1 = jt0_%= ll;" + "r1 += 8;" + "r2 = *(u64 *)(r1 + 0);" + "r3 = 1;" + "r4 = 2;" + ".8byte %[gotox_r2];" +"l0_%=: r0 = r3;" + "exit;" +"l1_%=: r0 = r4;" + "exit;" + : + : __imm_insn(gotox_r2, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_2, BPF_REG_0, 0, 0)) + : __clobber_all); +} + +#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */ + /* to retain debug info for BTF generation */ void kfunc_root(void) { diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 8a2fd596c8a3..61c32e91e8c3 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { - /* NULL passed to KF_TRUSTED_ARGS kfunc. */ + /* NULL passed to kfunc. */ bpf_cpumask_empty(NULL); return 0; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index dda6a8dada82..8f2ae9640886 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -1465,7 +1465,7 @@ int xdp_invalid_data_slice2(struct xdp_md *xdp) } /* Only supported prog type can create skb-type dynptrs */ -SEC("?raw_tp") +SEC("?xdp") __failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed") int skb_invalid_ctx(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 4d756b623557..462712ff3b8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -77,7 +77,7 @@ int on_open_validate_file_read(void *c) err = 1; return 0; } - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback); return 0; } diff --git a/tools/testing/selftests/bpf/progs/free_timer.c b/tools/testing/selftests/bpf/progs/free_timer.c index 4501ae8fc414..eccb2d47db43 100644 --- a/tools/testing/selftests/bpf/progs/free_timer.c +++ b/tools/testing/selftests/bpf/progs/free_timer.c @@ -7,6 +7,16 @@ #define MAX_ENTRIES 8 +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + struct map_value { struct bpf_timer timer; }; diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c new file mode 100644 index 000000000000..86e8a2fe467e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__u64 test1_entry_result = 0; +__u64 test1_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test1, int a, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test1_entry_result = a == 1 && ret == 0; + return 0; + } + + test1_exit_result = a == 1 && ret == 2; + return 0; +} + +__u64 test2_entry_result = 0; +__u64 test2_exit_result = 0; + +SEC("fsession/bpf_fentry_test3") +int BPF_PROG(test2, char a, int b, __u64 c, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test2_entry_result = a == 4 && b == 5 && c == 6 && ret == 0; + return 0; + } + + test2_exit_result = a == 4 && b == 5 && c == 6 && ret == 15; + return 0; +} + +__u64 test3_entry_result = 0; +__u64 test3_exit_result = 0; + +SEC("fsession/bpf_fentry_test4") +int BPF_PROG(test3, void *a, char b, int c, __u64 d, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test3_entry_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 0; + return 0; + } + + test3_exit_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 34; + return 0; +} + +__u64 test4_entry_result = 0; +__u64 test4_exit_result = 0; + +SEC("fsession/bpf_fentry_test5") +int BPF_PROG(test4, __u64 a, void *b, short c, int d, __u64 e, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test4_entry_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 0; + return 0; + } + + test4_exit_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 65; + return 0; +} + +__u64 test5_entry_result = 0; +__u64 test5_exit_result = 0; + +SEC("fsession/bpf_fentry_test7") +int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + if (!arg) + test5_entry_result = ret == 0; + return 0; + } + + if (!arg) + test5_exit_result = 1; + return 0; +} + +__u64 test6_entry_result = 0; +__u64 test6_exit_result = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test6, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test6_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test6_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} + +__u64 test7_entry_ok = 0; +__u64 test7_exit_ok = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0xAAAABBBBCCCCDDDDull; + test7_entry_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; + } + + test7_exit_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; +} + +__u64 test8_entry_ok = 0; +__u64 test8_exit_ok = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test8, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0x1111222233334444ull; + test8_entry_ok = *cookie == 0x1111222233334444ull; + return 0; + } + + test8_exit_ok = *cookie == 0x1111222233334444ull; + return 0; +} + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a, int ret) +{ + __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + test9_entry_result = a == 1 && ret == 0; + *cookie = 0x123456ULL; + return 0; + } + + test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; + return 0; +} + +__u64 test10_result = 0; +SEC("fexit/bpf_fentry_test1") +int BPF_PROG(test10, int a, int ret) +{ + test10_result = a == 1 && ret == 2; + return 0; +} + +__u64 test11_result = 0; +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test11, int a) +{ + test11_result = a == 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index e0f34a55e697..180ba5098ca1 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> +#include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <errno.h> @@ -121,3 +121,85 @@ int BPF_PROG(fexit_test, int _a, int *_b, int _ret) test4_result &= err == 0 && ret == 1234; return 0; } + +__u64 test5_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test1_tp") +int BPF_PROG(tp_test1) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0; + __s64 err; + + test5_result = cnt == 1; + + err = bpf_get_func_arg(ctx, 0, &a); + test5_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test5_result &= err == -EINVAL; + + return 0; +} + +__u64 test6_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test2_tp") +int BPF_PROG(tp_test2) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, b = 0, z = 0; + __s64 err; + + test6_result = cnt == 2; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test6_result &= err == 0 && (int) a == 2; + + err = bpf_get_func_arg(ctx, 1, &b); + test6_result &= err == 0 && b == 3; + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 2, &z); + test6_result &= err == -EINVAL; + + return 0; +} + +__u64 test7_result = 0; +#if defined(bpf_target_x86) || defined(bpf_target_arm64) +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0, ret = 0; + __s64 err; + + test7_result = cnt == 1; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test7_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test7_result &= err == -EINVAL; + + if (bpf_session_is_return(ctx)) { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 2; + } else { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 0; + } + + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test7) +{ + test7_result = 1; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 2011cacdeb18..43ff836a8ed8 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -103,3 +103,26 @@ int BPF_URETPROBE(test8, int ret) test8_result = (const void *) addr == (const void *) uprobe_trigger; return 0; } + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; +#if defined(bpf_target_x86) || defined(bpf_target_arm64) +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test9_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test9_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + test9_entry_result = test9_exit_result = 1; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 7dd92a303bf6..7f27b517d5d5 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1926,4 +1926,144 @@ static int loop1_wrapper(void) ); } +/* + * This is similar to a test case absent_mark_in_the_middle_state(), + * but adapted for use with bpf_loop(). + */ +SEC("raw_tp") +__flag(BPF_F_TEST_STATE_FREQ) +__failure __msg("math between fp pointer and register with unbounded min value is not allowed") +__naked void absent_mark_in_the_middle_state4(void) +{ + /* + * Equivalent to a C program below: + * + * int main(void) { + * fp[-8] = bpf_get_prandom_u32(); + * fp[-16] = -32; // used in a memory access below + * bpf_loop(7, loop_cb4, fp, 0); + * return 0; + * } + * + * int loop_cb4(int i, void *ctx) { + * if (unlikely(ctx[-8] > bpf_get_prandom_u32())) + * *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected + * if (unlikely(fp[-8] > bpf_get_prandom_u32())) + * ctx[-16] = -31; // makes said access unaligned + * return 0; + * } + */ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r8 = r0;" + "*(u64 *)(r10 - 8) = r0;" + "*(u64 *)(r10 - 16) = -32;" + "r1 = 7;" + "r2 = loop_cb4 ll;" + "r3 = r10;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_loop), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +__used __naked +static void loop_cb4(void) +{ + asm volatile ( + "r9 = r2;" + "r8 = *(u64 *)(r9 - 8);" + "r6 = *(u64 *)(r9 - 16);" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto use_fp16_%=;" + "1:" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto update_fp16_%=;" + "2:" + "r0 = 0;" + "exit;" + "use_fp16_%=:" + "r1 = r10;" + "r1 += r6;" + "*(u64 *)(r1 + 0) = 42;" + "goto 1b;" + "update_fp16_%=:" + "*(u64 *)(r9 - 16) = -31;" + "goto 2b;" + : + : __imm(bpf_get_prandom_u32) + ); +} + +SEC("raw_tp") +__success +__naked int stack_misc_vs_scalar_in_a_loop(void) +{ + asm volatile( + "*(u8 *)(r10 - 15) = 1;" /* This marks stack slot fp[-16] as STACK_MISC. */ + "*(u8 *)(r10 - 23) = 1;" + "*(u8 *)(r10 - 31) = 1;" + "*(u8 *)(r10 - 39) = 1;" + "*(u8 *)(r10 - 47) = 1;" + "*(u8 *)(r10 - 55) = 1;" + "*(u8 *)(r10 - 63) = 1;" + "*(u8 *)(r10 - 71) = 1;" + "*(u8 *)(r10 - 79) = 1;" + "r1 = r10;" + "r1 += -8;" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "loop_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto loop_end_%=;" + +#define maybe_change_stack_slot(off) \ + "call %[bpf_get_prandom_u32];" \ + "if r0 == 42 goto +1;" \ + "goto +1;" \ + "*(u64 *)(r10 " #off ") = r0;" + + /* + * When comparing verifier states fp[-16] will be + * either STACK_MISC or SCALAR. Pruning logic should + * consider old STACK_MISC equivalent to current SCALAR + * to avoid states explosion. + */ + maybe_change_stack_slot(-16) + maybe_change_stack_slot(-24) + maybe_change_stack_slot(-32) + maybe_change_stack_slot(-40) + maybe_change_stack_slot(-48) + maybe_change_stack_slot(-56) + maybe_change_stack_slot(-64) + maybe_change_stack_slot(-72) + maybe_change_stack_slot(-80) + +#undef maybe_change_stack_slot + + "goto loop_%=;" + "loop_end_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_destroy];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_iter_num_new), + __imm(bpf_iter_num_next), + __imm(bpf_iter_num_destroy), + __imm_addr(amap) + : __clobber_all + ); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/iters_css.c b/tools/testing/selftests/bpf/progs/iters_css.c index ec1f6c2f590b..5a1d87d186a9 100644 --- a/tools/testing/selftests/bpf/progs/iters_css.c +++ b/tools/testing/selftests/bpf/progs/iters_css.c @@ -12,8 +12,7 @@ char _license[] SEC("license") = "GPL"; pid_t target_pid; u64 root_cg_id, leaf_cg_id; u64 first_cg_id, last_cg_id; - -int pre_order_cnt, post_order_cnt, tree_high; +int pre_order_cnt, post_order_cnt, children_cnt, tree_high; struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; void bpf_cgroup_release(struct cgroup *p) __ksym; @@ -43,7 +42,7 @@ int iter_css_for_each(const void *ctx) } root_css = &root_cgrp->self; leaf_css = &leaf_cgrp->self; - pre_order_cnt = post_order_cnt = tree_high = 0; + pre_order_cnt = post_order_cnt = children_cnt = tree_high = 0; first_cg_id = last_cg_id = 0; bpf_rcu_read_lock(); @@ -60,6 +59,10 @@ int iter_css_for_each(const void *ctx) first_cg_id = cur_cgrp->kn->id; } + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_CHILDREN) { + children_cnt++; + } + bpf_for_each(css, pos, leaf_css, BPF_CGROUP_ITER_ANCESTORS_UP) tree_high++; diff --git a/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c new file mode 100644 index 000000000000..89b6a47e22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +extern int bpf_kfunc_implicit_arg(int a) __weak __ksym; +extern int bpf_kfunc_implicit_arg_impl(int a, struct bpf_prog_aux *aux) __weak __ksym; /* illegal */ +extern int bpf_kfunc_implicit_arg_legacy(int a, int b) __weak __ksym; +extern int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) __weak __ksym; + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +__retval(5) +int test_kfunc_implicit_arg(void *ctx) +{ + return bpf_kfunc_implicit_arg(5); +} + +SEC("syscall") +__failure __msg("cannot find address for kernel function bpf_kfunc_implicit_arg_impl") +int test_kfunc_implicit_arg_impl_illegal(void *ctx) +{ + return bpf_kfunc_implicit_arg_impl(5, NULL); +} + +SEC("syscall") +__retval(7) +int test_kfunc_implicit_arg_legacy(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy(3, 4); +} + +SEC("syscall") +__retval(11) +int test_kfunc_implicit_arg_legacy_impl(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy_impl(5, 6, NULL); +} diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c index 28f8487c9059..14f39fa6d515 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c @@ -5,9 +5,24 @@ char _license[] SEC("license") = "GPL"; +int pid = 0; + SEC("kprobe.multi") int test_override(struct pt_regs *ctx) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + bpf_override_return(ctx, 123); + return 0; +} + +SEC("kprobe") +int test_kprobe_override(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + bpf_override_return(ctx, 123); return 0; } diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c index 0835b5edf685..ad627016e3e5 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> +#include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <stdbool.h> -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -23,16 +22,16 @@ int BPF_PROG(trigger) return 0; } -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -42,17 +41,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_kprobe_1_result); + return check_cookie(ctx, 1, &test_kprobe_1_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_kprobe_2_result); + return check_cookie(ctx, 2, &test_kprobe_2_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_kprobe_3_result); + return check_cookie(ctx, 3, &test_kprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index 637e75df2e14..d0be77011a84 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -62,7 +62,6 @@ SEC("lsm/inode_unlink") int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct bpf_local_storage *local_storage; struct local_storage *storage; struct task_struct *task; bool is_self_unlink; @@ -88,15 +87,10 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) if (!storage || storage->value) return 0; - if (bpf_task_storage_delete(&task_storage_map, task)) + if (bpf_task_storage_delete(&task_storage_map2, task)) return 0; - /* Ensure that the task_storage_map is disconnected from the storage. - * The storage memory should not be freed back to the - * bpf_mem_alloc. - */ - local_storage = task->bpf_storage; - if (!local_storage || local_storage->smap) + if (bpf_task_storage_delete(&task_storage_map, task)) return 0; task_storage_result = 0; @@ -164,18 +158,9 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, if (bpf_sk_storage_delete(&sk_storage_map2, sk)) return 0; - storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!storage) - return 0; - if (bpf_sk_storage_delete(&sk_storage_map, sk)) return 0; - /* Ensure that the sk_storage_map is disconnected from the storage. */ - if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap) - return 0; - sk_storage_result = 0; return 0; } diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index edaba481db9d..e708ffbe1f61 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -487,6 +487,24 @@ int test_map_kptr_ref3(struct __sk_buff *ctx) return 0; } +int num_of_refs; + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + SEC("syscall") int test_ls_map_kptr_ref1(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 4c0ff01f1a96..6443b320c732 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -272,7 +272,7 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) SEC("?tc") __failure -__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") +__msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -291,7 +291,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") +__failure __msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c index 37c2d2608ec0..ed6a2a93d5a5 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c @@ -187,4 +187,36 @@ out: return 0; } +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, u32); +} percpu SEC(".maps"); + +SEC("?fentry/bpf_fentry_test1") +int BPF_PROG(test_percpu_array, int x) +{ + u64 value = 0xDEADC0DE; + int key = 0; + + bpf_map_update_elem(&percpu, &key, &value, BPF_ANY); + return 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, u32); +} percpu_cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int cgroup_egress(struct __sk_buff *skb) +{ + u32 *val = bpf_get_local_storage(&percpu_cgroup_storage, 0); + + *val = 1; + return 1; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 4acb6af2dfe3..70b7baf9304b 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("dereference of modified ptr_or_null_ ptr R2 off=16 disallowed") +__failure __msg("Possibly NULL pointer passed to trusted arg1") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index d70c28824bbe..b4e073168fb1 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -7,6 +7,16 @@ #include "bpf_tracing_net.h" #include "bpf_misc.h" +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c deleted file mode 100644 index 69da05bb6c63..000000000000 --- a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ -#include "vmlinux.h" -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> - -extern bool CONFIG_PREEMPTION __kconfig __weak; -extern const int bpf_task_storage_busy __ksym; - -char _license[] SEC("license") = "GPL"; - -int pid = 0; -int busy = 0; - -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, long); -} task SEC(".maps"); - -SEC("raw_tp/sys_enter") -int BPF_PROG(read_bpf_task_storage_busy) -{ - int *value; - - if (!CONFIG_PREEMPTION) - return 0; - - if (bpf_get_current_pid_tgid() >> 32 != pid) - return 0; - - value = bpf_this_cpu_ptr(&bpf_task_storage_busy); - if (value) - busy = *value; - - return 0; -} diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c index 46d6eb2a3b17..c8f4815c8dfb 100644 --- a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c +++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c @@ -6,7 +6,6 @@ #include <bpf/bpf_tracing.h> #include <bpf/bpf_core_read.h> -void *local_storage_ptr = NULL; void *sk_ptr = NULL; int cookie_found = 0; __u64 cookie = 0; @@ -19,21 +18,17 @@ struct { __type(value, int); } sk_storage SEC(".maps"); -SEC("fexit/bpf_local_storage_destroy") -int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage) +SEC("fexit/bpf_sk_storage_free") +int BPF_PROG(bpf_sk_storage_free, struct sock *sk) { - struct sock *sk; - - if (local_storage_ptr != local_storage) + if (sk_ptr != sk) return 0; - sk = bpf_core_cast(sk_ptr, struct sock); if (sk->sk_cookie.counter != cookie) return 0; cookie_found++; omem = sk->sk_omem_alloc.counter; - local_storage_ptr = NULL; return 0; } @@ -50,7 +45,6 @@ int BPF_PROG(inet6_sock_destruct, struct sock *sk) if (value && *value == 0xdeadbeef) { cookie_found++; sk_ptr = sk; - local_storage_ptr = sk->sk_bpf_storage; } return 0; diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index a96c8150d7f5..6830f2978613 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -31,6 +31,13 @@ int unused(void) __u32 stack_key; +SEC("kprobe") +int kprobe_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + SEC("kprobe.multi") int kprobe_multi_test(struct pt_regs *ctx) { @@ -46,4 +53,24 @@ int rawtp_test(void *ctx) return 0; } +SEC("fentry/bpf_testmod_stacktrace_test") +int fentry_test(struct pt_regs *ctx) +{ + /* + * Skip 2 bpf_program/trampoline stack entries: + * - bpf_prog_bd1f7a949f55fb03_fentry_test + * - bpf_trampoline_182536277701 + */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + +SEC("fexit/bpf_testmod_stacktrace_test") +int fexit_test(struct pt_regs *ctx) +{ + /* Skip 2 bpf_program/trampoline stack entries, check fentry_test. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 4a5bd852f10c..6f999ba951a3 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -42,6 +42,10 @@ int size; u64 fault_addr; void *arena_ptr; +#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) + +private(STREAM) struct bpf_spin_lock block; + SEC("syscall") __success __retval(0) int stream_exhaust(void *ctx) @@ -234,4 +238,53 @@ int stream_arena_callback_fault(void *ctx) return 0; } +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_print_stack_kfunc(void *ctx) +{ + return bpf_stream_print_stack(BPF_STDERR); +} + +SEC("syscall") +__success __retval(-2) +int stream_print_stack_invalid_id(void *ctx) +{ + /* Try to pass an invalid stream ID. */ + return bpf_stream_print_stack((enum bpf_stream_id)0xbadcafe); +} + +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stdout(_STR) +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_print_kfuncs_locked(void *ctx) +{ + int ret; + + bpf_spin_lock(&block); + + ret = bpf_stream_printk(BPF_STDOUT, _STR); + if (ret) + goto out; + + ret = bpf_stream_print_stack(BPF_STDERR); + +out: + bpf_spin_unlock(&block); + + return ret; +} + + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 3662515f0107..8e8249f3521c 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 826e6b6aff7e..bddc4e8579d2 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -33,6 +33,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null1(void *ctx) { return bpf_strncasecmp(NULL, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null2(void *ctx) { return bpf_strncasecmp("HELLO", NULL, 5); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -57,6 +59,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { ret SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr1(void *ctx) { return bpf_strncasecmp(user_ptr, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr2(void *ctx) { return bpf_strncasecmp("HELLO", user_ptr, 5); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -83,6 +87,8 @@ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault1(void *ctx) { return bpf_strncasecmp(invalid_kern_ptr, "HELLO", 5); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault2(void *ctx) { return bpf_strncasecmp("HELLO", invalid_kern_ptr, 5); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 05e1da1f250f..412c53b87b18 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -8,6 +8,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } +SEC("syscall") int test_strncasecmp_too_long(void *ctx) { return bpf_strncasecmp(long_str, long_str, sizeof(long_str)); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index a8513964516b..f65b1226a81a 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -17,6 +17,13 @@ __test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO __test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } __test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } __test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } +__test(0) int test_strncasecmp_eq1(void *ctx) { return bpf_strncasecmp(str, "hello world", 11); } +__test(0) int test_strncasecmp_eq2(void *ctx) { return bpf_strncasecmp(str, "HELLO WORLD", 11); } +__test(0) int test_strncasecmp_eq3(void *ctx) { return bpf_strncasecmp(str, "HELLO world", 11); } +__test(0) int test_strncasecmp_eq4(void *ctx) { return bpf_strncasecmp(str, "hello", 5); } +__test(0) int test_strncasecmp_eq5(void *ctx) { return bpf_strncasecmp(str, "hello world!", 11); } +__test(-1) int test_strncasecmp_neq1(void *ctx) { return bpf_strncasecmp(str, "hello!", 6); } +__test(1) int test_strncasecmp_neq2(void *ctx) { return bpf_strncasecmp(str, "abc", 3); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c new file mode 100644 index 000000000000..68842e3f936b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +int test_pid; + +/* Programs associated with st_ops_map_a */ + +#define MAP_A_MAGIC 1234 +int test_err_a; + +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + return MAP_A_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +#define MAP_B_MAGIC 5678 +int test_err_b; + +SEC("struct_ops") +int BPF_PROG(test_1_b, struct st_ops_args *args) +{ + return MAP_B_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_b, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c new file mode 100644 index 000000000000..0bed49e9f217 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array_map SEC(".maps"); + +#define MAP_MAGIC 1234 +int recur; +int test_err; +int timer_ns; +int timer_test_1_ret; +int timer_cb_run; + +__noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + struct st_ops_args args = {}; + + recur++; + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + recur--; + + timer_cb_run++; + + return 0; +} + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + struct bpf_timer *timer; + int key = 0; + + if (!recur) { + timer = bpf_map_lookup_elem(&array_map, &key); + if (!timer) + return 0; + + bpf_timer_init(timer, &array_map, 1); + bpf_timer_set_callback(timer, timer_cb); + bpf_timer_start(timer, timer_ns, 0); + } + + return MAP_MAGIC; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c new file mode 100644 index 000000000000..396b3e58c729 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define MAP_A_MAGIC 1234 +int test_err_a; +int recur; + +/* + * test_1_a is reused. The kfunc should not be able to get the associated + * struct_ops and call test_1 recursively as it is ambiguous. + */ +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + int ret; + + if (!recur) { + recur++; + ret = bpf_kfunc_multi_st_ops_test_1_assoc(args); + if (ret != -1) + test_err_a++; + recur--; + } + + return MAP_A_MAGIC; +} + +/* Programs associated with st_ops_map_a */ + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +int test_err_b; + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); + if (ret != MAP_A_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_a, +}; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c index 6a2dd5367802..c8d217e89eea 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c @@ -12,7 +12,7 @@ void bpf_task_release(struct task_struct *p) __ksym; * reject programs returning a referenced kptr of the wrong type. */ SEC("struct_ops/test_return_ref_kptr") -__failure __msg("At program exit the register R0 is not a known value (ptr_or_null_)") +__failure __msg("At program exit the register R0 is not a known value (trusted_ptr_or_null_)") struct task_struct *BPF_PROG(kptr_return_fail__wrong_type, int dummy, struct task_struct *task, struct cgroup *cgrp) { diff --git a/tools/testing/selftests/bpf/progs/tailcall_sleepable.c b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c new file mode 100644 index 000000000000..d959a9eaaa9c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" +#include "bpf_test_utils.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps"); + +SEC("?uprobe") +int uprobe_normal(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("?uprobe.s") +int uprobe_sleepable_1(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +int executed = 0; +int my_pid = 0; + +SEC("?uprobe.s") +int uprobe_sleepable_2(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + executed++; + return 0; +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h index 432fff2af844..fed53d63a7e5 100644 --- a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -80,7 +80,7 @@ struct tld_metadata { }; struct tld_meta_u { - __u8 cnt; + __u16 cnt; __u16 size; struct tld_metadata metadata[TLD_MAX_DATA_CNT]; }; diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c index f1853c38aada..b37359432692 100644 --- a/tools/testing/selftests/bpf/progs/task_ls_recursion.c +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -36,14 +36,9 @@ int BPF_PROG(on_update) if (!test_pid || task->pid != test_pid) return 0; + /* This will succeed as there is no real deadlock */ ptr = bpf_task_storage_get(&map_a, task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); - /* ptr will not be NULL when it is called from - * the bpf_task_storage_get(&map_b,...F_CREATE) in - * the BPF_PROG(on_enter) below. It is because - * the value can be found in map_a and the kernel - * does not need to acquire any spin_lock. - */ if (ptr) { int err; @@ -53,12 +48,7 @@ int BPF_PROG(on_update) nr_del_errs++; } - /* This will still fail because map_b is empty and - * this BPF_PROG(on_update) has failed to acquire - * the percpu busy lock => meaning potential - * deadlock is detected and it will fail to create - * new storage. - */ + /* This will succeed as there is no real deadlock */ ptr = bpf_task_storage_get(&map_b, task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (ptr) diff --git a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c index 986829aaf73a..6ce98fe9f387 100644 --- a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c +++ b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c @@ -1,15 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" +#include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; -#ifndef EBUSY -#define EBUSY 16 -#endif - extern bool CONFIG_PREEMPTION __kconfig __weak; int nr_get_errs = 0; int nr_del_errs = 0; @@ -40,7 +37,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, ret = bpf_task_storage_delete(&task_storage, bpf_get_current_task_btf()); - if (ret == -EBUSY) + if (ret == -EDEADLK || ret == -ETIMEDOUT) __sync_fetch_and_add(&nr_del_errs, 1); return 0; diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 663a80990f8f..a6009d105158 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -65,8 +65,7 @@ int oncpu_hash_map(struct pt_regs *args) work = bpf_map_lookup_elem(&hmap, &key); if (!work) return 0; - - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -80,7 +79,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work); return 0; } @@ -102,6 +101,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 1270953fd092..82e4b8913333 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 55e555f7f41b..1d4378f351ef 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index f7b330ddd007..076fbf03a126 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -15,7 +15,6 @@ extern unsigned long CONFIG_HZ __kconfig; -int test_einval_bpf_tuple = 0; int test_einval_reserved = 0; int test_einval_reserved_new = 0; int test_einval_netns_id = 0; @@ -99,12 +98,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); - ct = lookup_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def)); - if (ct) - bpf_ct_release(ct); - else - test_einval_bpf_tuple = opts_def.error; - opts_def.reserved[0] = 1; ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index a586f087ffeb..2c156cd166af 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -4,6 +4,7 @@ #include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_core_read.h> +#include "bpf_misc.h" struct nf_conn; @@ -18,6 +19,10 @@ struct nf_conn *bpf_skb_ct_alloc(struct __sk_buff *, struct bpf_sock_tuple *, u3 struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32, struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_alloc(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_ct_insert_entry(struct nf_conn *) __ksym; void bpf_ct_release(struct nf_conn *) __ksym; void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym; @@ -146,4 +151,56 @@ int change_status_after_alloc(struct __sk_buff *ctx) return 0; } +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int lookup_null_bpf_tuple(struct __sk_buff *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int lookup_null_bpf_opts(struct __sk_buff *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c index c88ccc53529a..0c3df19626cb 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c +++ b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c @@ -33,7 +33,7 @@ struct { } hashmap1 SEC(".maps"); -static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 +static __noinline __tag1 __tag2 int foo(int x __tag1 __tag2) { struct key_t key; value_t val = {}; diff --git a/tools/testing/selftests/bpf/progs/test_ctx.c b/tools/testing/selftests/bpf/progs/test_ctx.c new file mode 100644 index 000000000000..7d4995506717 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ctx.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +extern void bpf_kfunc_trigger_ctx_check(void) __ksym; + +int count_hardirq; +int count_softirq; +int count_task; + +/* Triggered via bpf_prog_test_run from user-space */ +SEC("syscall") +int trigger_all_contexts(void *ctx) +{ + if (bpf_in_task()) + __sync_fetch_and_add(&count_task, 1); + + /* Trigger the firing of a hardirq and softirq for test. */ + bpf_kfunc_trigger_ctx_check(); + return 0; +} + +/* Observer for HardIRQ */ +SEC("fentry/bpf_testmod_test_hardirq_fn") +int BPF_PROG(on_hardirq) +{ + if (bpf_in_hardirq()) + __sync_fetch_and_add(&count_hardirq, 1); + return 0; +} + +/* Observer for SoftIRQ */ +SEC("fentry/bpf_testmod_test_softirq_fn") +int BPF_PROG(on_softirq) +{ + if (bpf_in_serving_softirq()) + __sync_fetch_and_add(&count_softirq, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index 061befb004c2..d249113ed657 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -48,10 +48,9 @@ SEC("?lsm.s/bpf") __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - unsigned long val = 0; + static struct bpf_dynptr val; - return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val, - (struct bpf_dynptr *)val, NULL); + return bpf_verify_pkcs7_signature(&val, &val, NULL); } SEC("lsm.s/bpf") diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 0ad1bf1ede8d..967081bbcfe1 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("expected pointer to stack or const struct bpf_dynptr") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 69aacc96db36..ef9edca184ea 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -44,4 +44,18 @@ int prog_skb_parser(struct __sk_buff *skb) return SK_PASS; } +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_ingress(struct __sk_buff *skb) +{ + int one = 1; + + return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS); +} + +SEC("sk_skb/stream_parser") +int prog_skb_verdict_ingress_strp(struct __sk_buff *skb) +{ + return skb->len; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 7330c61b5730..7376df405a6b 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -23,7 +23,12 @@ static const int cfg_udp_src = 20000; (((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) -#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) +struct vxlanhdr___local { + __be32 vx_flags; + __be32 vx_vni; +}; + +#define L2_PAD_SZ (sizeof(struct vxlanhdr___local) + ETH_HLEN) #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 @@ -154,7 +159,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -195,12 +200,12 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -285,7 +290,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -325,12 +330,12 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -639,7 +644,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) olen += ETH_HLEN; break; case VXLAN_UDP_PORT: - olen += ETH_HLEN + sizeof(struct vxlanhdr); + olen += ETH_HLEN + sizeof(struct vxlanhdr___local); break; } break; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 4c677c001258..d6d5fefcd9b1 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include <linux/bpf.h> -#include <time.h> + +#include <vmlinux.h> #include <stdbool.h> #include <errno.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#define CLOCK_MONOTONIC 1 +#define CLOCK_BOOTTIME 7 + char _license[] SEC("license") = "GPL"; + struct hmap_elem { int counter; struct bpf_timer timer; @@ -59,10 +63,14 @@ __u64 bss_data; __u64 abs_data; __u64 err; __u64 ok; +__u64 test_hits; +__u64 update_hits; +__u64 cancel_hits; __u64 callback_check = 52; __u64 callback2_check = 52; __u64 pinned_callback_check; __s32 pinned_cpu; +bool async_cancel = 0; #define ARRAY 1 #define HTAB 2 @@ -164,6 +172,29 @@ int BPF_PROG2(test1, int, a) return 0; } +static int timer_error(void *map, int *key, struct bpf_timer *timer) +{ + err = 42; + return 0; +} + +SEC("syscall") +int test_async_cancel_succeed(void *ctx) +{ + struct bpf_timer *arr_timer; + int array_key = ARRAY; + + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + bpf_timer_set_callback(arr_timer, timer_error); + bpf_timer_start(arr_timer, 100000 /* 100us */, 0); + bpf_timer_cancel_async(arr_timer); + ok = 7; + return 0; +} + /* callback for prealloc and non-prealloca hashtab timers */ static int timer_cb2(void *map, int *key, struct hmap_elem *val) { @@ -399,27 +430,88 @@ static int race_timer_callback(void *race_array, int *race_key, struct bpf_timer return 0; } -SEC("syscall") -int race(void *ctx) +/* Callback that updates its own map element */ +static int update_self_callback(void *map, int *key, struct bpf_timer *timer) +{ + struct elem init = {}; + + bpf_map_update_elem(map, key, &init, BPF_ANY); + __sync_fetch_and_add(&update_hits, 1); + return 0; +} + +/* Callback that cancels itself using async cancel */ +static int cancel_self_callback(void *map, int *key, struct bpf_timer *timer) +{ + bpf_timer_cancel_async(timer); + __sync_fetch_and_add(&cancel_hits, 1); + return 0; +} + +enum test_mode { + TEST_RACE_SYNC, + TEST_RACE_ASYNC, + TEST_UPDATE, + TEST_CANCEL, +}; + +static __always_inline int test_common(enum test_mode mode) { struct bpf_timer *timer; - int err, race_key = 0; struct elem init; + int ret, key = 0; __builtin_memset(&init, 0, sizeof(struct elem)); - bpf_map_update_elem(&race_array, &race_key, &init, BPF_ANY); - timer = bpf_map_lookup_elem(&race_array, &race_key); + bpf_map_update_elem(&race_array, &key, &init, BPF_ANY); + timer = bpf_map_lookup_elem(&race_array, &key); if (!timer) - return 1; + return 0; - err = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC); - if (err && err != -EBUSY) - return 1; + ret = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC); + if (ret && ret != -EBUSY) + return 0; + + if (mode == TEST_RACE_SYNC || mode == TEST_RACE_ASYNC) + bpf_timer_set_callback(timer, race_timer_callback); + else if (mode == TEST_UPDATE) + bpf_timer_set_callback(timer, update_self_callback); + else + bpf_timer_set_callback(timer, cancel_self_callback); - bpf_timer_set_callback(timer, race_timer_callback); bpf_timer_start(timer, 0, 0); - bpf_timer_cancel(timer); + + if (mode == TEST_RACE_ASYNC) + bpf_timer_cancel_async(timer); + else if (mode == TEST_RACE_SYNC) + bpf_timer_cancel(timer); return 0; } + +SEC("syscall") +int race(void *ctx) +{ + return test_common(async_cancel ? TEST_RACE_ASYNC : TEST_RACE_SYNC); +} + +SEC("perf_event") +int nmi_race(void *ctx) +{ + __sync_fetch_and_add(&test_hits, 1); + return test_common(TEST_RACE_ASYNC); +} + +SEC("perf_event") +int nmi_update(void *ctx) +{ + __sync_fetch_and_add(&test_hits, 1); + return test_common(TEST_UPDATE); +} + +SEC("perf_event") +int nmi_cancel(void *ctx) +{ + __sync_fetch_and_add(&test_hits, 1); + return test_common(TEST_CANCEL); +} diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c new file mode 100644 index 000000000000..019518ee18cd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define CLOCK_MONOTONIC 1 + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer_map SEC(".maps"); + +volatile int in_timer_start; +volatile int tp_called; + +static int timer_cb(void *map, int *key, struct elem *value) +{ + return 0; +} + +SEC("tp_btf/hrtimer_cancel") +int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) +{ + struct bpf_timer *timer; + int key = 0; + + if (!in_timer_start) + return 0; + + tp_called = 1; + timer = bpf_map_lookup_elem(&timer_map, &key); + + /* + * Call bpf_timer_start() from the tracepoint within hrtimer logic + * on the same timer to make sure it doesn't deadlock. + */ + bpf_timer_start(timer, 1000000000, 0); + return 0; +} + +SEC("syscall") +int start_timer(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer_map, &key); + /* claude may complain here that there is no NULL check. Ignoring it. */ + bpf_timer_init(timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, timer_cb); + + /* + * call hrtimer_start() twice, so that 2nd call does + * remove_hrtimer() and trace_hrtimer_cancel() tracepoint. + */ + in_timer_start = 1; + bpf_timer_start(timer, 1000000000, 0); + bpf_timer_start(timer, 1000000000, 0); + in_timer_start = 0; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/timer_start_delete_race.c b/tools/testing/selftests/bpf/progs/timer_start_delete_race.c new file mode 100644 index 000000000000..2ff6357f33f9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_start_delete_race.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <time.h> +#include <bpf/bpf_helpers.h> + +#define ITER_CNT 2000 + +struct map_value { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} timer_map SEC(".maps"); + +long cb_cnt; + +/* + * Timer callback that accesses the map value. + * If the race bug exists and this runs on freed memory, + * KASAN should detect it. + */ +static int timer_cb(void *map, int *key, struct map_value *value) +{ + __sync_fetch_and_add(&cb_cnt, 1); + return 0; +} + +SEC("syscall") +int start_timer(void *ctx) +{ + struct map_value *value; + int i; + + for (i = 0; i < ITER_CNT; i++) { + int key = 0; + + value = bpf_map_lookup_elem(&timer_map, &key); + /* hey claude, no if (!value) here. The verifier got smarter */ + + bpf_timer_init(&value->timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(&value->timer, timer_cb); + bpf_timer_start(&value->timer, 100000000, 0); + } + return 0; +} + +SEC("syscall") +int delete_elem(void *ctx) +{ + int i; + + for (i = 0; i < ITER_CNT; i++) { + int key = 0; + + bpf_map_delete_elem(&timer_map, &key); + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2898b3749d07..4ea0422d1042 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,6 +25,34 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } +volatile const int stacktrace; + +typedef __u64 stack_trace_t[128]; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, stack_trace_t); +} stack_heap SEC(".maps"); + +static __always_inline void do_stacktrace(void *ctx) +{ + if (!stacktrace) + return; + + __u64 *ptr = bpf_map_lookup_elem(&stack_heap, &(__u32){0}); + + if (ptr) + bpf_get_stack(ctx, ptr, sizeof(stack_trace_t), 0); +} + +static __always_inline void handle(void *ctx) +{ + inc_counter(); + do_stacktrace(ctx); +} + SEC("?uprobe") int bench_trigger_uprobe(void *ctx) { @@ -81,21 +109,21 @@ int trigger_driver_kfunc(void *ctx) SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -108,7 +136,7 @@ int bench_kprobe_multi_empty(void *ctx) SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -121,34 +149,34 @@ int bench_kretprobe_multi_empty(void *ctx) SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fmod_ret/bpf_modify_return_test_tp") int bench_trigger_fmodret(void *ctx) { - inc_counter(); + handle(ctx); return -22; } SEC("?tp/bpf_test_run/bpf_trigger_tp") int bench_trigger_tp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?raw_tp/bpf_trigger_tp") int bench_trigger_rawtp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c index 30bff90b68dc..6e46bb00ff58 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> +#include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <stdbool.h> -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -51,7 +50,7 @@ static int uprobe_multi_check(void *ctx, bool is_return) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*") int uprobe(struct pt_regs *ctx) { - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } static __always_inline bool verify_sleepable_user_copy(void) @@ -67,5 +66,5 @@ int uprobe_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_multi_sleep_result++; - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c index 5befdf944dc6..b5db196614a9 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> +#include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <stdbool.h> -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -13,16 +12,16 @@ __u64 test_uprobe_1_result = 0; __u64 test_uprobe_2_result = 0; __u64 test_uprobe_3_result = 0; -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -32,17 +31,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1") int uprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_uprobe_1_result); + return check_cookie(ctx, 1, &test_uprobe_1_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2") int uprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_uprobe_2_result); + return check_cookie(ctx, 2, &test_uprobe_2_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3") int uprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_uprobe_3_result); + return check_cookie(ctx, 3, &test_uprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c index 8fbcd69fae22..3ce309248a04 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> +#include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <stdbool.h> -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -16,11 +15,11 @@ int idx_return = 0; __u64 test_uprobe_cookie_entry[6]; __u64 test_uprobe_cookie_return[3]; -static int check_cookie(void) +static int check_cookie(struct pt_regs *ctx) { - __u64 *cookie = bpf_session_cookie(); + __u64 *cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) { + if (bpf_session_is_return(ctx)) { if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return)) return 1; test_uprobe_cookie_return[idx_return++] = *cookie; @@ -40,5 +39,5 @@ int uprobe_recursive(struct pt_regs *ctx) if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - return check_cookie(); + return check_cookie(ctx); } diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 7f4827eede3c..c4b8daac4388 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -10,6 +10,8 @@ #include "bpf_experimental.h" #include "bpf_arena_common.h" +#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) + struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); @@ -21,6 +23,37 @@ struct { #endif } arena SEC(".maps"); +SEC("socket") +__success __retval(0) +int basic_alloc1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile int __arena *page1, *page2, *no_page; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page2, 1); + if (*page1 != 1) + return 6; + if (*page2 != 0 && *page2 != 2) /* use-after-free should return 0 or the stored value */ + return 7; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc1(void *ctx) @@ -60,6 +93,44 @@ int basic_alloc1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_alloc2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile char __arena *page1, *page2, *page3, *page4; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + if (!page1) + return 1; + page2 = page1 + __PAGE_SIZE; + page3 = page1 + __PAGE_SIZE * 2; + page4 = page1 - __PAGE_SIZE; + *page1 = 1; + *page2 = 2; + *page3 = 3; + *page4 = 4; + if (*page1 != 1) + return 1; + if (*page2 != 2) + return 2; + if (*page3 != 0) + return 3; + if (*page4 != 0) + return 4; + bpf_arena_free_pages(&arena, (void __arena *)page1, 2); + if (*page1 != 0 && *page1 != 1) + return 5; + if (*page2 != 0 && *page2 != 2) + return 6; + if (*page3 != 0) + return 7; + if (*page4 != 0) + return 8; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc2(void *ctx) @@ -102,6 +173,19 @@ struct bpf_arena___l { struct bpf_map map; } __attribute__((preserve_access_index)); +SEC("socket") +__success __retval(0) __log_level(2) +int basic_alloc3_nosleep(void *ctx) +{ + struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena; + volatile char __arena *pages; + + pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0); + if (!pages) + return 1; + return 0; +} + SEC("syscall") __success __retval(0) __log_level(2) int basic_alloc3(void *ctx) @@ -115,6 +199,38 @@ int basic_alloc3(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return 1; + + page += __PAGE_SIZE; + + /* Reserve the second page */ + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 2; + + /* Try to explicitly allocate the reserved page. */ + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if (page) + return 3; + + /* Try to implicitly allocate the page (since there's only 2 of them). */ + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (page) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve1(void *ctx) @@ -147,6 +263,26 @@ int basic_reserve1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if ((u64)page) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve2(void *ctx) @@ -168,6 +304,27 @@ int basic_reserve2(void *ctx) } /* Reserve the same page twice, should return -EBUSY. */ +SEC("socket") +__success __retval(0) +int reserve_twice_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret != -EBUSY) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_twice(void *ctx) @@ -190,6 +347,36 @@ int reserve_twice(void *ctx) } /* Try to reserve past the end of the arena. */ +SEC("socket") +__success __retval(0) +int reserve_invalid_region_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + /* Try a NULL pointer. */ + ret = bpf_arena_reserve_pages(&arena, NULL, 3); + if (ret != -EINVAL) + return 1; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 3); + if (ret != -EINVAL) + return 2; + + ret = bpf_arena_reserve_pages(&arena, page, 4096); + if (ret != -EINVAL) + return 3; + + ret = bpf_arena_reserve_pages(&arena, page, (1ULL << 32) - 1); + if (ret != -EINVAL) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_invalid_region(void *ctx) @@ -254,4 +441,40 @@ int iter_maps3(struct bpf_iter__bpf_map *ctx) return 0; } +private(ARENA_TESTS) struct bpf_spin_lock arena_bpf_test_lock; + +/* Use the arena kfunc API while under a BPF lock. */ +SEC("syscall") +__success __retval(0) +int arena_kfuncs_under_bpf_lock(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + bpf_spin_lock(&arena_bpf_test_lock); + + /* Get a separate region of the arena. */ + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 1; + } + + bpf_arena_free_pages(&arena, page, 1); + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 2; + } + + bpf_arena_free_pages(&arena, page, 1); + + bpf_spin_unlock(&arena_bpf_test_lock); +#endif + + return 0; +} char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c new file mode 100644 index 000000000000..83182ddbfb95 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_experimental.h" +#include "bpf_arena_common.h" +#include "bpf_misc.h" + +#define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1)) +#define GLOBAL_PAGES (16) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Global data, to be placed at the end of the arena. + */ +volatile char __arena global_data[GLOBAL_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve1(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0x5a; + __u8 __arena *guard, *globals; + volatile char __arena *ptr; + int i; + int ret; + + guard = (void __arena *)arena_base(&arena); + globals = (void __arena *)(arena_base(&arena) + (ARENA_PAGES - GLOBAL_PAGES) * PAGE_SIZE); + + /* Reserve the region we've offset the globals by. */ + ret = bpf_arena_reserve_pages(&arena, guard, ARENA_PAGES - GLOBAL_PAGES); + if (ret) + return 1; + + /* Make sure the globals are in the expected offset. */ + ret = bpf_arena_reserve_pages(&arena, globals, 1); + if (!ret) + return 2; + + /* Verify globals are properly mapped in by libbpf. */ + for (i = 0; i < GLOBAL_PAGES; i++) { + ptr = &global_data[i][PAGE_SIZE / 2]; + + *ptr = magic; + if (*ptr != magic) + return i + 3; + } +#endif + return 0; +} + +/* + * Relocation check by reading directly into the global data w/o using symbols. + */ +SEC("syscall") +__success __retval(0) +int check_relocation(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0xfa; + u8 __arena *ptr; + + global_data[GLOBAL_PAGES - 1][PAGE_SIZE / 2] = magic; + ptr = (u8 __arena *)((u64)(ARENA_PAGES * PAGE_SIZE - PAGE_SIZE / 2)); + if (*ptr != magic) + return 1; + +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c new file mode 100644 index 000000000000..e6bd7b61f9f1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +#define ARENA_PAGES (32) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Fill the entire arena with global data. + * The offset into the arena should be 0. + */ +char __arena global_data[ARENA_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve2(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + void __arena *guard; + int ret; + + guard = (void __arena *)arena_base(&arena); + + /* Make sure the data at offset 0 case is properly handled. */ + ret = bpf_arena_reserve_pages(&arena, guard, 1); + if (!ret) + return 1; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index f19e15400b3e..5f7e7afee169 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -23,18 +23,31 @@ int big_alloc1(void *ctx) { #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) volatile char __arena *page1, *page2, *no_page, *page3; - void __arena *base; + u64 base; - page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + base = (u64)arena_base(&arena); + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); if (!page1) return 1; + + if ((u64)page1 != base) + return 15; + *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE * 2, + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - 2 * PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; - no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE, + + /* Test for the guard region at the end of the arena. */ + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE - PAGE_SIZE, + 1, NUMA_NO_NODE, 0); + if (no_page) + return 16; + + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) return 3; @@ -270,5 +283,34 @@ int big_alloc2(void *ctx) return 9; return 0; } + +SEC("socket") +__success __retval(0) +int big_alloc3(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *pages; + u64 i; + + /* + * Allocate 2051 pages in one go to check how kmalloc_nolock() handles large requests. + * Since kmalloc_nolock() can allocate up to 1024 struct page * at a time, this call should + * result in three batches: two batches of 1024 pages each, followed by a final batch of 3 + * pages. + */ + pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); + if (!pages) + return 0; + + bpf_for(i, 0, 2051) + pages[i * PAGE_SIZE] = 123; + bpf_for(i, 0, 2051) + if (pages[i * PAGE_SIZE] != 123) + return i; + + bpf_arena_free_pages(&arena, pages, 2051); +#endif + return 0; +} #endif char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 7efa9521105e..39aff82549c9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -96,7 +96,7 @@ int wq_non_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } @@ -114,7 +114,7 @@ int wq_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } @@ -156,7 +156,7 @@ int task_work_non_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } @@ -176,6 +176,6 @@ int task_work_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 411a18437d7e..560531404bce 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar(id=1-1)") __retval(0) __naked void sub64_partial_overflow(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_bswap.c b/tools/testing/selftests/bpf/progs/verifier_bswap.c index e61755656e8d..4b779deee767 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bswap.c +++ b/tools/testing/selftests/bpf/progs/verifier_bswap.c @@ -48,6 +48,49 @@ __naked void bswap_64(void) : __clobber_all); } +#define BSWAP_RANGE_TEST(name, op, in_value, out_value) \ + SEC("socket") \ + __success __log_level(2) \ + __msg("r0 &= {{.*}}; R0=scalar({{.*}},var_off=(0x0; " #in_value "))") \ + __msg("r0 = " op " r0 {{.*}}; R0=scalar({{.*}},var_off=(0x0; " #out_value "))") \ + __naked void name(void) \ + { \ + asm volatile ( \ + "call %[bpf_get_prandom_u32];" \ + "r0 &= " #in_value ";" \ + "r0 = " op " r0;" \ + "r2 = " #out_value " ll;" \ + "if r0 > r2 goto trap_%=;" \ + "r0 = 0;" \ + "exit;" \ + "trap_%=:" \ + "r1 = 42;" \ + "r0 = *(u64 *)(r1 + 0);" \ + "exit;" \ + : \ + : __imm(bpf_get_prandom_u32) \ + : __clobber_all); \ + } + +BSWAP_RANGE_TEST(bswap16_range, "bswap16", 0x3f00, 0x3f) +BSWAP_RANGE_TEST(bswap32_range, "bswap32", 0x3f00, 0x3f0000) +BSWAP_RANGE_TEST(bswap64_range, "bswap64", 0x3f00, 0x3f000000000000) +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +BSWAP_RANGE_TEST(be16_range, "be16", 0x3f00, 0x3f) +BSWAP_RANGE_TEST(be32_range, "be32", 0x3f00, 0x3f0000) +BSWAP_RANGE_TEST(be64_range, "be64", 0x3f00, 0x3f000000000000) +BSWAP_RANGE_TEST(le16_range, "le16", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(le32_range, "le32", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(le64_range, "le64", 0x3f00, 0x3f00) +#else +BSWAP_RANGE_TEST(be16_range, "be16", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(be32_range, "be32", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(be64_range, "be64", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(le16_range, "le16", 0x3f00, 0x3f) +BSWAP_RANGE_TEST(le32_range, "le32", 0x3f00, 0x3f0000) +BSWAP_RANGE_TEST(le64_range, "le64", 0x3f00, 0x3f000000000000) +#endif + #else SEC("socket") diff --git a/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c new file mode 100644 index 000000000000..fa3b656ad4fb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +SEC("syscall") +__success __retval(0) +int test_default_trusted_ptr(void *ctx) +{ + struct prog_test_member *trusted_ptr; + + trusted_ptr = bpf_kfunc_get_default_trusted_ptr_test(); + /* + * Test BPF kfunc bpf_get_default_trusted_ptr_test() returns a + * PTR_TO_BTF_ID | PTR_TRUSTED, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + bpf_kfunc_put_default_trusted_ptr_test(trusted_ptr); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c new file mode 100644 index 000000000000..4672af0b3268 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c @@ -0,0 +1,1149 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <limits.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +/* This file contains unit tests for signed/unsigned division and modulo + * operations (with divisor as a constant), focusing on verifying whether + * BPF verifier's range tracking module soundly and precisely computes + * the results. + */ + +SEC("socket") +__description("UDIV32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 /= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= w2 {{.*}}; R1=0 R2=0") +__naked void udiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 /= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 /= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= r2 {{.*}}; R1=0 R2=0") +__naked void udiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 /= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=3,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-3,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s/= w2 {{.*}}; R1=0 R2=0") +__naked void sdiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 s/= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=0x80000000") +__naked void sdiv32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s/= -1; \ + if w1 != %[int_min] goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=3)") +__naked void sdiv64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=2)") +__naked void sdiv64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s/= r2 {{.*}}; R1=0 R2=0") +__naked void sdiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 s/= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=scalar()") +__naked void sdiv64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=0x8000000000000000") +__naked void sdiv64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s/= -1; \ + r2 = %[llong_min] ll; \ + if r1 != r2 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 10; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 %%= w2; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 10; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 %%= r2; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff)) R2=0") +__naked void smod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w2 = 0; \ + w1 s%%= w2; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s%%= -1; \ + if w1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s%%= -1; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10) R2=0") +__naked void smod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r2 = 0; \ + r1 s%%= r2; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s%%= -1; \ + if r1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s%%= -1; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 1204fbc58178..e7dae0cf9c17 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -72,7 +72,7 @@ int trusted_task_arg_nonnull_fail1(void *ctx) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("R1 type=trusted_ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") __msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") int trusted_task_arg_nonnull_fail2(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 059aa716e3d0..889c9b78b912 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -17,17 +17,6 @@ struct { __type(value, struct val); } map_spin_lock SEC(".maps"); -struct timer { - struct bpf_timer t; -}; - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, int); - __type(value, struct timer); -} map_timer SEC(".maps"); - SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") __failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") @@ -85,106 +74,6 @@ __naked void bpf_prog_type_raw_tracepoint_1(void) } SEC("kprobe") -__description("bpf_timer_init isn restricted in BPF_PROG_TYPE_KPROBE") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void in_bpf_prog_type_kprobe_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("perf_event") -__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void bpf_prog_type_perf_event_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("tracepoint") -__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void in_bpf_prog_type_tracepoint_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("raw_tracepoint") -__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void bpf_prog_type_raw_tracepoint_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("kprobe") __description("bpf_spin_lock is forbidden in BPF_PROG_TYPE_KPROBE") __failure __msg("tracing progs cannot use bpf_spin_lock yet") __naked void in_bpf_prog_type_kprobe_3(void) diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c new file mode 100644 index 000000000000..4ea254063646 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +SEC("fentry/bpf_fentry_test1") +__success __retval(0) +__arch_x86_64 +__jited(" addq %gs:{{.*}}, %rax") +__arch_arm64 +__jited(" mrs x7, SP_EL0") +int inline_bpf_get_current_task(void) +{ + bpf_get_current_task(); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c index a509cad97e69..1fce7a7e8d03 100644 --- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -32,7 +32,7 @@ static void task_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(task_kfunc_raw_tp) { task_kfunc_load_test(); @@ -86,7 +86,7 @@ static void cgrp_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cgrp_kfunc_raw_tp) { cgrp_kfunc_load_test(); @@ -138,7 +138,7 @@ static void cpumask_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cpumask_kfunc_raw_tp) { cpumask_kfunc_load_test(); diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c index 8f755d2464cf..2ef346c827c2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c +++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> +#include <limits.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" @@ -18,9 +19,9 @@ __naked void scalars(void) r4 = r1; \ w2 += 0x7FFFFFFF; \ w4 += 0; \ - if r2 == 0 goto l1; \ + if r2 == 0 goto l0_%=; \ exit; \ -l1: \ +l0_%=: \ r4 >>= 63; \ r3 = 1; \ r3 -= r4; \ @@ -31,4 +32,335 @@ l1: \ " ::: __clobber_all); } +/* + * Test that sync_linked_regs() preserves register IDs. + * + * The sync_linked_regs() function copies bounds from known_reg to linked + * registers. When doing so, it must preserve each register's original id + * to allow subsequent syncs from the same source to work correctly. + * + */ +SEC("socket") +__success +__naked void sync_linked_regs_preserves_id(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; /* r0 in [0, 255] */ \ + r1 = r0; /* r0, r1 linked with id 1 */ \ + r1 += 4; /* r1 has id=1 and off=4 in [4, 259] */ \ + if r1 < 10 goto l0_%=; \ + /* r1 in [10, 259], r0 synced to [6, 255] */ \ + r2 = r0; /* r2 has id=1 and in [6, 255] */ \ + if r1 < 14 goto l0_%=; \ + /* r1 in [14, 259], r0 synced to [10, 255] */ \ + if r0 >= 10 goto l0_%=; \ + /* Never executed */ \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success +__naked void scalars_neg(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 += -4; \ + if r1 s< 0 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Same test but using BPF_SUB instead of BPF_ADD with negative immediate */ +SEC("socket") +__success +__naked void scalars_neg_sub(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 -= 4; \ + if r1 s< 0 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* alu32 with negative offset */ +SEC("socket") +__success +__naked void scalars_neg_alu32_add(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xff; \ + w1 = w0; \ + w1 += -4; \ + if w1 s< 0 goto l0_%=; \ + if w0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* alu32 with negative offset using SUB */ +SEC("socket") +__success +__naked void scalars_neg_alu32_sub(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xff; \ + w1 = w0; \ + w1 -= 4; \ + if w1 s< 0 goto l0_%=; \ + if w0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Positive offset: r1 = r0 + 4, then if r1 >= 6, r0 >= 2, so r0 != 0 */ +SEC("socket") +__success +__naked void scalars_pos(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 += 4; \ + if r1 < 6 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* SUB with negative immediate: r1 -= -4 is equivalent to r1 += 4 */ +SEC("socket") +__success +__naked void scalars_sub_neg_imm(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 -= -4; \ + if r1 < 6 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Double ADD clears the ID (can't accumulate offsets) */ +SEC("socket") +__failure +__msg("div by zero") +__naked void scalars_double_add(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 += 2; \ + r1 += 2; \ + if r1 < 6 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Test that sync_linked_regs() correctly handles large offset differences. + * r1.off = S32_MIN, r2.off = 1, delta = S32_MIN - 1 requires 64-bit math. + */ +SEC("socket") +__success +__naked void scalars_sync_delta_overflow(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r2 = r0; \ + r1 += %[s32_min]; \ + r2 += 1; \ + if r2 s< 100 goto l0_%=; \ + if r1 s< 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32), + [s32_min]"i"(INT_MIN) + : __clobber_all); +} + +/* + * Another large delta case: r1.off = S32_MAX, r2.off = -1. + * delta = S32_MAX - (-1) = S32_MAX + 1 requires 64-bit math. + */ +SEC("socket") +__success +__naked void scalars_sync_delta_overflow_large_range(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r2 = r0; \ + r1 += %[s32_max]; \ + r2 += -1; \ + if r2 s< 0 goto l0_%=; \ + if r1 s>= 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32), + [s32_max]"i"(INT_MAX) + : __clobber_all); +} + +/* + * Test linked scalar tracking with alu32 and large positive offset (0x7FFFFFFF). + * After w1 += 0x7FFFFFFF, w1 wraps to negative for any r0 >= 1. + * If w1 is signed-negative, then r0 >= 1, so r0 != 0. + */ +SEC("socket") +__success +__naked void scalars_alu32_big_offset(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xff; \ + w1 = w0; \ + w1 += 0x7FFFFFFF; \ + if w1 s>= 0 goto l0_%=; \ + if w0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__failure +__msg("div by zero") +__naked void scalars_alu32_basic(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + w1 += 1; \ + if r1 > 10 goto 1f; \ + r0 >>= 32; \ + if r0 == 0 goto 1f; \ + r0 /= 0; \ +1: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Test alu32 linked register tracking with wrapping. + * R0 is bounded to [0xffffff00, 0xffffffff] (high 32-bit values) + * w1 += 0x100 causes R1 to wrap to [0, 0xff] + * + * After sync_linked_regs, if bounds are computed correctly: + * R0 should be [0x00000000_ffffff00, 0x00000000_ffffff80] + * R0 >> 32 == 0, so div by zero is unreachable + * + * If bounds are computed incorrectly (64-bit underflow): + * R0 becomes [0xffffffff_ffffff00, 0xffffffff_ffffff80] + * R0 >> 32 == 0xffffffff != 0, so div by zero is reachable + */ +SEC("socket") +__success +__naked void scalars_alu32_wrap(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 |= 0xffffff00; \ + r1 = r0; \ + w1 += 0x100; \ + if r1 > 0x80 goto l0_%=; \ + r2 = r0; \ + r2 >>= 32; \ + if r2 == 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success +void alu32_negative_offset(void) +{ + volatile char path[5]; + volatile int offset = bpf_get_prandom_u32(); + int off = offset; + + if (off >= 5 && off < 10) + path[off - 5] = '.'; + + /* So compiler doesn't say: error: variable 'path' set but not used */ + __sink(path[0]); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 6af9100a37ff..38e8e9176862 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> +#include <vmlinux.h> #include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> #include "bpf_misc.h" SEC("lsm/file_permission") @@ -159,4 +160,32 @@ __naked int disabled_hook_test3(void *ctx) ::: __clobber_all); } +SEC("lsm/mmap_file") +__description("not null checking nullable pointer in bpf_lsm_mmap_file") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(no_null_check, struct file *file) +{ + struct inode *inode; + + inode = file->f_inode; + __sink(inode); + + return 0; +} + +SEC("lsm/mmap_file") +__description("null checking nullable pointer in bpf_lsm_mmap_file") +__success +int BPF_PROG(null_check, struct file *file) +{ + struct inode *inode; + + if (file) { + inode = file->f_inode; + __sink(inode); + } + + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index c0ce690ddb68..3072fee9a448 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -715,6 +715,51 @@ __naked void ignore_unique_scalar_ids_old(void) : __clobber_all); } +/* Check that two registers with 0 scalar IDs in a verified state can be mapped + * to the same scalar ID in current state. + */ +SEC("socket") +__success __log_level(2) +/* The states should be equivalent on reaching insn 12. + */ +__msg("12: safe") +__msg("processed 17 insns") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void two_nil_old_ids_one_cur_id(void) +{ + asm volatile ( + /* Give unique scalar IDs to r{6,7} */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + "r6 = r0;" + "r6 *= 1;" + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + "r7 = r0;" + "r7 *= 1;" + "r0 = 0;" + /* Maybe make r{6,7} IDs identical */ + "if r6 > r7 goto l0_%=;" + "goto l1_%=;" +"l0_%=:" + "r6 = r7;" +"l1_%=:" + /* Mark r{6,7} precise. + * Get here in two states: + * - first: r6{.id=0}, r7{.id=0} (cached state) + * - second: r6{.id=A}, r7{.id=A} + * Verifier considers such states equivalent. + * Thus "exit;" would be verified only once. + */ + "r2 = r10;" + "r2 += r6;" + "r2 += r7;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Check that two different scalar IDs in a verified state can't be * mapped to the same scalar ID in current state. */ @@ -723,9 +768,9 @@ __success __log_level(2) /* The exit instruction should be reachable from two states, * use two matches and "processed .. insns" to ensure this. */ -__msg("13: (95) exit") -__msg("13: (95) exit") -__msg("processed 18 insns") +__msg("15: (95) exit") +__msg("15: (95) exit") +__msg("processed 20 insns") __flag(BPF_F_TEST_STATE_FREQ) __naked void two_old_ids_one_cur_id(void) { @@ -734,9 +779,11 @@ __naked void two_old_ids_one_cur_id(void) "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" "r6 = r0;" + "r8 = r0;" "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" "r7 = r0;" + "r9 = r0;" "r0 = 0;" /* Maybe make r{6,7} IDs identical */ "if r6 > r7 goto l0_%=;" diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 8613ea160dcd..be328100ba53 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -532,6 +532,74 @@ __naked void arsh32_imm_zero_extend_check(void) } SEC("socket") +__description("arsh32 imm sign positive extend check") +__success __retval(0) +__log_level(2) +__msg("2: (57) r6 &= 4095 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__msg("3: (67) r6 <<= 32 ; R6=scalar(smin=smin32=0,smax=umax=0xfff00000000,smax32=umax32=0,var_off=(0x0; 0xfff00000000))") +__msg("4: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__naked void arsh32_imm_sign_extend_positive_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign negative extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__naked void arsh32_imm_sign_extend_negative_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__naked void arsh32_imm_sign_extend_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 2047; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") __description("end16 (to_le) reg zero extend check") __success __success_unpriv __retval(0) __naked void le_reg_zero_extend_check_1(void) @@ -670,4 +738,89 @@ __naked void ldx_w_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_and(void) +{ + /* Below is what LLVM generates in cilium's bpf_wiregard.o */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 &= -134; /* w2 becomes 0 or -134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if w2 != -136 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_and(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 &= -134; \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if r2 != -136 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_or(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 |= 134; /* w2 becomes -1 or 134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if w2 == -1 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_or(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 |= 134; /* r2 becomes -1 or 134 */ \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if r2 == -1 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c index 28b4f7035ceb..8ee1243e62a8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c +++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c @@ -950,4 +950,26 @@ l3_%=: r0 = 0; \ " ::: __clobber_all); } +SEC("socket") +__description("unpriv: nospec after dead stack write in helper") +__success __success_unpriv +__retval(0) +/* Dead code sanitizer rewrites the call to `goto -1`. */ +__naked void unpriv_dead_helper_stack_write_nospec_result(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 != 1 goto l0_%=; \ + r2 = 0; \ + r3 = r10; \ + r3 += -16; \ + r4 = 4; \ + r5 = 0; \ + call %[bpf_skb_load_bytes_relative]; \ +l0_%=: exit; \ +" : + : __imm(bpf_skb_load_bytes_relative) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 2129e4353fd9..4d8273c258d5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -173,14 +173,15 @@ __naked void flow_keys_illegal_variable_offset_alu(void) asm volatile(" \ r6 = r1; \ r7 = *(u64*)(r6 + %[flow_keys_off]); \ - r8 = 8; \ - r8 /= 1; \ + call %[bpf_get_prandom_u32]; \ + r8 = r0; \ r8 &= 8; \ r7 += r8; \ r0 = *(u64*)(r7 + 0); \ exit; \ " : - : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) : __clobber_all); } diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp.c b/tools/testing/selftests/bpf/progs/verifier_xdp.c index 50768ed179b3..7dc9226aeb34 100644 --- a/tools/testing/selftests/bpf/progs/verifier_xdp.c +++ b/tools/testing/selftests/bpf/progs/verifier_xdp.c @@ -5,6 +5,14 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, __u64); + __uint(map_flags, BPF_F_RDONLY_PROG); +} map_array_ro SEC(".maps"); + SEC("xdp") __description("XDP, using ifindex from netdev") __success __retval(1) @@ -21,4 +29,31 @@ l0_%=: exit; \ : __clobber_all); } +SEC("xdp") +__description("XDP, using xdp_store_bytes from RO map") +__success __retval(0) +__naked void xdp_store_bytes_from_ro_map(void) +{ + asm volatile (" \ + r6 = r1; \ + r1 = 0; \ + *(u64*)(r10 - 8) = r1; \ + r2 = r10; \ + r2 += -8; \ + r1 = %[map_array_ro] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + r1 = r6; \ + r2 = 0; \ + r3 = r0; \ + r4 = 8; \ + call %[bpf_xdp_store_bytes]; \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm(bpf_xdp_store_bytes), + __imm_addr(map_array_ro) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index d06f6d40594a..3767f5595bbc 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -97,7 +97,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("arg#0 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { @@ -123,7 +123,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") long test_wrong_wq_pointer_offset(void *ctx) { diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh b/tools/testing/selftests/bpf/test_bpftool_map.sh deleted file mode 100755 index 515b1df0501e..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_map.sh +++ /dev/null @@ -1,398 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME="bpftool_map" -BPF_FILE="security_bpf_map.bpf.o" -BPF_ITER_FILE="bpf_iter_map_elem.bpf.o" -PROTECTED_MAP_NAME="prot_map" -NOT_PROTECTED_MAP_NAME="not_prot_map" -BPF_FS_TMP_PARENT="/tmp" -BPF_FS_PARENT=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_FS_PARENT=${BPF_FS_PARENT:-$BPF_FS_TMP_PARENT} -# bpftool will mount bpf file system under BPF_DIR if it is not mounted -# under BPF_FS_PARENT. -BPF_DIR="$BPF_FS_PARENT/test_$TESTNAME" -SCRIPT_DIR=$(dirname $(realpath "$0")) -BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE" -BPF_ITER_FILE_PATH="$SCRIPT_DIR/$BPF_ITER_FILE" -BPFTOOL_PATH="bpftool" -# Assume the script is located under tools/testing/selftests/bpf/ -KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../) - -_cleanup() -{ - set +eu - - # If BPF_DIR is a mount point this will not remove the mount point itself. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null - - # Unmount if BPF filesystem was temporarily created. - if [ "$BPF_FS_PARENT" = "$BPF_FS_TMP_PARENT" ]; then - # A loop and recursive unmount are required as bpftool might - # create multiple mounts. For example, a bind mount of the directory - # to itself. The bind mount is created to change mount propagation - # flags on an actual mount point. - max_attempts=3 - attempt=0 - while mountpoint -q "$BPF_DIR" && [ $attempt -lt $max_attempts ]; do - umount -R "$BPF_DIR" 2>/dev/null - attempt=$((attempt+1)) - done - - # The directory still exists. Remove it now. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2>/dev/null - fi -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -check_root_privileges() { - if [ $(id -u) -ne 0 ]; then - echo "Need root privileges" - exit $ksft_skip - fi -} - -# Function to verify bpftool path. -# Parameters: -# $1: bpftool path -verify_bpftool_path() { - local bpftool_path="$1" - if ! "$bpftool_path" version > /dev/null 2>&1; then - echo "Could not run test without bpftool" - exit $ksft_skip - fi -} - -# Function to verify BTF support. -# The test requires BTF support for fmod_ret programs. -verify_btf_support() { - if [ ! -f /sys/kernel/btf/vmlinux ]; then - echo "Could not run test without BTF support" - exit $ksft_skip - fi -} - -# Function to initialize map entries with keys [0..2] and values set to 0. -# Parameters: -# $1: Map name -# $2: bpftool path -initialize_map_entries() { - local map_name="$1" - local bpftool_path="$2" - - for key in 0 1 2; do - "$bpftool_path" map update name "$map_name" key $key 0 0 0 value 0 0 0 $key - done -} - -# Test read access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -access_for_read() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - - # Test read access to the map. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Read access to $key in $map_name failed" - exit 1 - fi - - # Test read access to map's BTF data. - if ! "$bpftool_path" btf dump map "$name_cmd" "$map_name" 1>/dev/null; then - echo " Read access to $map_name for BTF data failed" - exit 1 - fi -} - -# Test write access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_write() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed but should have succeeded" - exit 1 - fi - fi -} - -# Test entry deletion for the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_deletion() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - # Test deletion by key for the map. - # Before deleting, check the key exists. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Key $key does not exist in $map_name" - exit 1 - fi - - # Delete by key. - if "$bpftool_path" map delete "$name_cmd" "$map_name" key $key 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Deletion for $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Deletion for $key in $map_name failed but should have succeeded" - exit 1 - fi - fi - - # After deleting, check the entry existence according to the expected status. - if "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - if [ "$write_should_succeed" = "true" ]; then - echo " Key $key for $map_name was not deleted but should have been deleted" - exit 1 - fi - else - if [ "$write_should_succeed" = "false" ]; then - echo "Key $key for $map_name was deleted but should have not been deleted" - exit 1 - fi - fi - - # Test creation of map's deleted entry, if deletion was successful. - # Otherwise, the entry exists. - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded after deletion attempt but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed after deletion attempt but should have succeeded" - exit 1 - fi - fi -} - -# Test map elements iterator. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: BPF_DIR -# $5: bpf iterator object file path -iterate_map_elem() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local bpf_dir="$4" - local bpf_file="$5" - local pin_path="$bpf_dir/map_iterator" - - "$bpftool_path" iter pin "$bpf_file" "$pin_path" map "$name_cmd" "$map_name" - if [ ! -f "$pin_path" ]; then - echo " Failed to pin iterator to $pin_path" - exit 1 - fi - - cat "$pin_path" 1>/dev/null - rm "$pin_path" 2>/dev/null -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key for rw -# $5: key to delete -# $6: Whether write should succeed (true/false) -# $7: BPF_DIR -# $8: bpf iterator object file path -access_map() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key_for_rw="$4" - local key_to_del="$5" - local write_should_succeed="$6" - local bpf_dir="$7" - local bpf_iter_file_path="$8" - - access_for_read "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" - access_for_write "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" \ - "$write_should_succeed" - access_for_deletion "$name_cmd" "$map_name" "$bpftool_path" "$key_to_del" \ - "$write_should_succeed" - iterate_map_elem "$name_cmd" "$map_name" "$bpftool_path" "$bpf_dir" \ - "$bpf_iter_file_path" -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Map name -# $2: bpftool path -# $3: BPF_DIR -# $4: Whether write should succeed (true/false) -# $5: bpf iterator object file path -test_map_access() { - local map_name="$1" - local bpftool_path="$2" - local bpf_dir="$3" - local pin_path="$bpf_dir/${map_name}_pinned" - local write_should_succeed="$4" - local bpf_iter_file_path="$5" - - # Test access to the map by name. - access_map "name" "$map_name" "$bpftool_path" "0 0 0 0" "1 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" - - # Pin the map to the BPF filesystem - "$bpftool_path" map pin name "$map_name" "$pin_path" - if [ ! -e "$pin_path" ]; then - echo " Failed to pin $map_name" - exit 1 - fi - - # Test access to the pinned map. - access_map "pinned" "$pin_path" "$bpftool_path" "0 0 0 0" "2 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" -} - -# Function to test map creation and map-of-maps -# Parameters: -# $1: bpftool path -# $2: BPF_DIR -test_map_creation_and_map_of_maps() { - local bpftool_path="$1" - local bpf_dir="$2" - local outer_map_name="outer_map_tt" - local inner_map_name="inner_map_tt" - - "$bpftool_path" map create "$bpf_dir/$inner_map_name" type array key 4 \ - value 4 entries 4 name "$inner_map_name" - if [ ! -f "$bpf_dir/$inner_map_name" ]; then - echo " Failed to create inner map file at $bpf_dir/$outer_map_name" - return 1 - fi - - "$bpftool_path" map create "$bpf_dir/$outer_map_name" type hash_of_maps \ - key 4 value 4 entries 2 name "$outer_map_name" inner_map name "$inner_map_name" - if [ ! -f "$bpf_dir/$outer_map_name" ]; then - echo " Failed to create outer map file at $bpf_dir/$outer_map_name" - return 1 - fi - - # Add entries to the outer map by name and by pinned path. - "$bpftool_path" map update pinned "$bpf_dir/$outer_map_name" key 0 0 0 0 \ - value pinned "$bpf_dir/$inner_map_name" - "$bpftool_path" map update name "$outer_map_name" key 1 0 0 0 value \ - name "$inner_map_name" - - # The outer map should be full by now. - # The following map update command is expected to fail. - if "$bpftool_path" map update name "$outer_map_name" key 2 0 0 0 value name \ - "$inner_map_name" 2>/dev/null; then - echo " Update for $outer_map_name succeeded but should have failed" - exit 1 - fi -} - -# Function to test map access with the btf list command -# Parameters: -# $1: bpftool path -test_map_access_with_btf_list() { - local bpftool_path="$1" - - # The btf list command iterates over maps for - # loaded BPF programs. - if ! "$bpftool_path" btf list 1>/dev/null; then - echo " Failed to access btf data" - exit 1 - fi -} - -set -eu - -trap cleanup_skip EXIT - -check_root_privileges - -verify_bpftool_path "$BPFTOOL_PATH" - -verify_btf_support - -trap cleanup EXIT - -# Load and attach the BPF programs to control maps access. -"$BPFTOOL_PATH" prog loadall "$BPF_FILE_PATH" "$BPF_DIR" autoattach - -initialize_map_entries "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" -initialize_map_entries "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" - -# Activate the map protection mechanism. Protection status is controlled -# by a value stored in the prot_status_map at index 0. -"$BPFTOOL_PATH" map update name prot_status_map key 0 0 0 0 value 1 0 0 0 - -# Test protected map (write should fail). -test_map_access "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "false" \ - "$BPF_ITER_FILE_PATH" - -# Test not protected map (write should succeed). -test_map_access "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "true" \ - "$BPF_ITER_FILE_PATH" - -test_map_creation_and_map_of_maps "$BPFTOOL_PATH" "$BPF_DIR" - -test_map_access_with_btf_list "$BPFTOOL_PATH" - -exit 0 diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh deleted file mode 100755 index b5520692f41b..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_metadata.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -BPF_FILE_USED="metadata_used.bpf.o" -BPF_FILE_UNUSED="metadata_unused.bpf.o" - -TESTNAME=bpftool_metadata -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME - -_cleanup() -{ - set +e - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -mkdir $BPF_DIR - -trap cleanup EXIT - -bpftool prog load $BPF_FILE_UNUSED $BPF_DIR/unused - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/unused - -bpftool prog load $BPF_FILE_USED $BPF_DIR/used - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/used - -exit 0 diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h index aeef86b3da74..45a5e41f3a92 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h @@ -63,6 +63,16 @@ BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare, sizeof(struct bpf_testmod_test_writable_ctx) ); +DECLARE_TRACE(bpf_testmod_fentry_test1, + TP_PROTO(int a), + TP_ARGS(a) +); + +DECLARE_TRACE(bpf_testmod_fentry_test2, + TP_PROTO(int a, u64 b), + TP_ARGS(a, b) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1669a7eeda26..186a25ab429a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -254,6 +254,22 @@ __bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) return NULL; } +static struct prog_test_member trusted_ptr; + +__bpf_kfunc struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) +{ + return &trusted_ptr; +} + +__bpf_kfunc void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) +{ + /* + * This BPF kfunc doesn't actually have any put/KF_ACQUIRE + * semantics. We're simply wanting to simulate a BPF kfunc that takes a + * struct prog_test_member pointer as an argument. + */ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -285,6 +301,12 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) call_rcu(&ctx->rcu, testmod_free_cb); } +__bpf_kfunc void bpf_testmod_ctx_release_dtor(void *ctx) +{ + bpf_testmod_ctx_release(ctx); +} +CFI_NOSEAL(bpf_testmod_ctx_release_dtor); + static struct bpf_testmod_ops3 *st_ops3; static int bpf_testmod_test_3(void) @@ -390,11 +412,15 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) noinline int bpf_testmod_fentry_test1(int a) { + trace_bpf_testmod_fentry_test1_tp(a); + return a + 1; } noinline int bpf_testmod_fentry_test2(int a, u64 b) { + trace_bpf_testmod_fentry_test2_tp(a, b); + return a + b; } @@ -693,9 +719,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) @@ -703,11 +729,13 @@ BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2) +BTF_ID_FLAGS(func, bpf_kfunc_get_default_trusted_ptr_test); +BTF_ID_FLAGS(func, bpf_kfunc_put_default_trusted_ptr_test); BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) BTF_ID(struct, bpf_testmod_ctx) -BTF_ID(func, bpf_testmod_ctx_release) +BTF_ID(func, bpf_testmod_ctx_release_dtor) static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { .owner = THIS_MODULE, @@ -1134,6 +1162,38 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux); + +__bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); + +/* hook targets */ +noinline void bpf_testmod_test_hardirq_fn(void) { barrier(); } +noinline void bpf_testmod_test_softirq_fn(void) { barrier(); } + +/* Tasklet for SoftIRQ context */ +static void ctx_check_tasklet_fn(struct tasklet_struct *t) +{ + bpf_testmod_test_softirq_fn(); +} + +DECLARE_TASKLET(ctx_check_tasklet, ctx_check_tasklet_fn); + +/* IRQ Work for HardIRQ context */ +static void ctx_check_irq_fn(struct irq_work *work) +{ + bpf_testmod_test_hardirq_fn(); + tasklet_schedule(&ctx_check_tasklet); +} + +static struct irq_work ctx_check_irq = IRQ_WORK_INIT_HARD(ctx_check_irq_fn); + +/* The kfunc trigger */ +__bpf_kfunc void bpf_kfunc_trigger_ctx_check(void) +{ + irq_work_queue(&ctx_check_irq); +} BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) @@ -1157,7 +1217,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) @@ -1171,11 +1231,16 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) +BTF_ID_FLAGS(func, bpf_kfunc_trigger_ctx_check) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1637,6 +1702,7 @@ static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) return NULL; } +/* Call test_1() of the struct_ops map identified by the id */ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) { struct bpf_testmod_multi_st_ops *st_ops; @@ -1652,6 +1718,38 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) return ret; } +/* Call test_1() of the associated struct_ops map */ +int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux) +{ + struct bpf_testmod_multi_st_ops *st_ops; + int ret = -1; + + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(aux); + if (st_ops) + ret = st_ops->test_1(args); + + return ret; +} + +int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux) +{ + if (aux && a > 0) + return a; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux) +{ + if (aux) + return a + b; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) +{ + return bpf_kfunc_implicit_arg_legacy(a, b, aux); +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = @@ -1774,6 +1872,10 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + /* Clean up irqwork and tasklet */ + irq_work_sync(&ctx_check_irq); + tasklet_kill(&ctx_check_tasklet); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); unregister_bpf_testmod_uprobe(); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 4df6fa6a92cb..d5c5454e257e 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -161,6 +161,16 @@ void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; -int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; +#ifndef __KERNEL__ +extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __weak __ksym; +extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; +#endif + +struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; +void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; + +void bpf_testmod_test_hardirq_fn(void); +void bpf_testmod_test_softirq_fn(void); +void bpf_kfunc_trigger_ctx_check(void) __ksym; #endif /* _BPF_TESTMOD_KFUNC_H */ diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 9437bdd4afa5..a5576b2dfc26 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -4,6 +4,18 @@ #include <bpf/libbpf.h> +#ifdef __x86_64__ +#define SYS_PREFIX "__x64_" +#elif defined(__s390x__) +#define SYS_PREFIX "__s390x_" +#elif defined(__aarch64__) +#define SYS_PREFIX "__arm64_" +#elif defined(__riscv) +#define SYS_PREFIX "__riscv_" +#else +#define SYS_PREFIX "" +#endif + #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c8d640802cce..9ca83dce100d 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index c0648dc009b5..e569d119fb60 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -81,7 +81,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 4294967295 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=4294967295", }, { "direct map access, write test 8", @@ -141,7 +141,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 536870912 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=536870912", }, { "direct map access, write test 13", diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 59a020c35647..061d98f6e9bb 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -229,11 +229,11 @@ { "precise: program doesn't prematurely prune branches", .insns = { - BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000), - BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401), BPF_JMP_IMM(BPF_JA, 0, 0, 0), BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2), BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1), diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index e962f133250c..1be1e353d40a 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2580,7 +2580,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last if (last && fmt == RESFMT_TABLE) { output_header_underlines(); printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n", - env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped); + env.files_processed, env.progs_processed, env.files_skipped, env.progs_skipped); } } diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 44c52f620fda..ce6c2642fd9b 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -168,6 +168,27 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) return atol(ptr + strlen(key)); } +long cg_read_key_long_poll(const char *cgroup, const char *control, + const char *key, long expected, int retries, + useconds_t wait_interval_us) +{ + long val = -1; + int i; + + for (i = 0; i < retries; i++) { + val = cg_read_key_long(cgroup, control, key); + if (val < 0) + return val; + + if (val == expected) + break; + + usleep(wait_interval_us); + } + + return val; +} + long cg_read_lc(const char *cgroup, const char *control) { char buf[PAGE_SIZE]; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index 7ab2824ed7b5..77f386dab5e8 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -17,6 +17,8 @@ #define CG_NAMED_NAME "selftest" #define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s")) +#define DEFAULT_WAIT_INTERVAL_US (100 * 1000) /* 100 ms */ + /* * Checks if two given values differ by less than err% of their sum. */ @@ -64,6 +66,9 @@ extern int cg_read_strstr(const char *cgroup, const char *control, extern long cg_read_long(const char *cgroup, const char *control); extern long cg_read_long_fd(int fd); long cg_read_key_long(const char *cgroup, const char *control, const char *key); +long cg_read_key_long_poll(const char *cgroup, const char *control, + const char *key, long expected, int retries, + useconds_t wait_interval_us); extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); extern int cg_open(const char *cgroup, const char *control, int flags); diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index a17256d9f88a..5dff3ad53867 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -269,7 +269,7 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3|A2:1-3|A3:2-3|B1:2-3 A1:P0|A3:P0|B1:P-2" + " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-1|A2:1|A3:1|B1:2-3 A1:P0|A3:P0|B1:P2" " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4" @@ -318,7 +318,7 @@ TEST_MATRIX=( # Invalid to valid local partition direct transition tests " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3" " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3" - " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:4-6 A1:P-2|B1:P0" + " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:5-6 A1:P2|B1:P0" " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3" # Local partition invalidation tests @@ -388,10 +388,10 @@ TEST_MATRIX=( " C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2" " C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1" - # A non-exclusive cpuset.cpus change will invalidate partition and its siblings - " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P0" - " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P-1" - " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P0|B1:P-1" + # A non-exclusive cpuset.cpus change will not invalidate its siblings partition. + " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:3 A1:P1|B1:P0" + " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-1|XA1:0-1|B1:2-3 A1:P1|B1:P1" + " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-1|B1:2-3 A1:P0|B1:P1" # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3|B1:4-5" @@ -417,6 +417,17 @@ TEST_MATRIX=( " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \ A1:P0|A2:P2:B1:P-1 2-4" + # When multiple partitions with conflicting cpuset.cpus are created, the + # latter created ones will only get what are left of the available exclusive + # CPUs. + " C1-3:P1 . . . . . . C3-5:P1 0 A1:1-3|B1:4-5:XB1:4-5 A1:P1|B1:P1" + + # cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive + " C1-3:X1-3 . . C4-5 . . . C1-2 0 A1:1-3|B1:1-2" + + # cpuset.cpus can become empty with task in it as it inherits parent's effective CPUs + " C1-3:S+ C2 . . . T:C . . 0 A1:1-3|A2:1-3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: @@ -427,7 +438,7 @@ TEST_MATRIX=( # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3|B1:4-5" - # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive + # cpuset.cpus.exclusive cannot be set to a superset of sibling's cpuset.cpus " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3|B1:4-5" ) @@ -477,6 +488,10 @@ REMOTE_TEST_MATRIX=( . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \ p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \ 1-2,4-6|1-2,5-6" + # c12 whose cpuset.cpus CPUs are all granted to c11 will become invalid partition + " C1-5:P1:S+ . C1-4:P1 C2-3 . . \ + . . . P1 . . p1:5|c11:1-4|c12:5 \ + p1:P1|c11:P1|c12:P-1" ) # diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index ca38525484e3..eeabd34bf083 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -26,6 +26,7 @@ */ #define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) +#define KMEM_DEAD_WAIT_RETRIES 80 static int alloc_dcache(const char *cgroup, void *arg) { @@ -306,9 +307,7 @@ static int test_kmem_dead_cgroups(const char *root) { int ret = KSFT_FAIL; char *parent; - long dead; - int i; - int max_time = 20; + long dead = -1; parent = cg_name(root, "kmem_dead_cgroups_test"); if (!parent) @@ -323,21 +322,19 @@ static int test_kmem_dead_cgroups(const char *root) if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) goto cleanup; - for (i = 0; i < max_time; i++) { - dead = cg_read_key_long(parent, "cgroup.stat", - "nr_dying_descendants "); - if (dead == 0) { - ret = KSFT_PASS; - break; - } - /* - * Reclaiming cgroups might take some time, - * let's wait a bit and repeat. - */ - sleep(1); - if (i > 5) - printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead); - } + /* + * Allow up to ~8s for reclaim of dying descendants to complete. + * This is a generous upper bound derived from stress testing, not + * from a specific kernel constant, and can be adjusted if reclaim + * behavior changes in the future. + */ + dead = cg_read_key_long_poll(parent, "cgroup.stat", + "nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES, + DEFAULT_WAIT_INTERVAL_US); + if (dead) + goto cleanup; + + ret = KSFT_PASS; cleanup: cg_destroy(parent); diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 4e1647568c5b..2fb096a2a9f9 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -21,6 +21,8 @@ #include "kselftest.h" #include "cgroup_util.h" +#define MEMCG_SOCKSTAT_WAIT_RETRIES 30 + static bool has_localevents; static bool has_recursiveprot; @@ -1384,6 +1386,7 @@ static int test_memcg_sock(const char *root) int bind_retries = 5, ret = KSFT_FAIL, pid, err; unsigned short port; char *memcg; + long sock_post = -1; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -1432,7 +1435,22 @@ static int test_memcg_sock(const char *root) if (cg_read_long(memcg, "memory.current") < 0) goto cleanup; - if (cg_read_key_long(memcg, "memory.stat", "sock ")) + /* + * memory.stat is updated asynchronously via the memcg rstat + * flushing worker, which runs periodically (every 2 seconds, + * see FLUSH_TIME). On a busy system, the "sock " counter may + * stay non-zero for a short period of time after the TCP + * connection is closed and all socket memory has been + * uncharged. + * + * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some + * scheduling slack) and require that the "sock " counter + * eventually drops to zero. + */ + sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0, + MEMCG_SOCKSTAT_WAIT_RETRIES, + DEFAULT_WAIT_INTERVAL_US); + if (sock_post) goto cleanup; ret = KSFT_PASS; diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c index a6f6d5f2ae07..5c8adee63641 100644 --- a/tools/testing/selftests/coredump/coredump_test_helpers.c +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -56,7 +56,7 @@ void crashing_child(void) pthread_create(&thread, NULL, do_nothing, NULL); /* crash on purpose */ - i = *(int *)NULL; + __builtin_trap(); } int create_detached_tmpfs(void) diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c index 56b17e8fe1be..567793b11107 100644 --- a/tools/testing/selftests/damon/access_memory.c +++ b/tools/testing/selftests/damon/access_memory.c @@ -8,6 +8,11 @@ #include <string.h> #include <time.h> +enum access_mode { + ACCESS_MODE_ONCE, + ACCESS_MODE_REPEAT, +}; + int main(int argc, char *argv[]) { char **regions; @@ -15,10 +20,12 @@ int main(int argc, char *argv[]) int nr_regions; int sz_region; int access_time_ms; + enum access_mode mode = ACCESS_MODE_ONCE; + int i; - if (argc != 4) { - printf("Usage: %s <number> <size (bytes)> <time (ms)>\n", + if (argc < 4) { + printf("Usage: %s <number> <size (bytes)> <time (ms)> [mode]\n", argv[0]); return -1; } @@ -27,15 +34,21 @@ int main(int argc, char *argv[]) sz_region = atoi(argv[2]); access_time_ms = atoi(argv[3]); + if (argc > 4 && !strcmp(argv[4], "repeat")) + mode = ACCESS_MODE_REPEAT; + regions = malloc(sizeof(*regions) * nr_regions); for (i = 0; i < nr_regions; i++) regions[i] = malloc(sz_region); - for (i = 0; i < nr_regions; i++) { - start_clock = clock(); - while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC < - access_time_ms) - memset(regions[i], i, sz_region); - } + do { + for (i = 0; i < nr_regions; i++) { + start_clock = clock(); + while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC + < access_time_ms) + memset(regions[i], i, sz_region); + } + } while (mode == ACCESS_MODE_REPEAT); + return 0; } diff --git a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh index 64c5d8c518a4..33a7ff43ed6c 100755 --- a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh +++ b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh @@ -14,6 +14,13 @@ then exit $ksft_skip fi +kmemleak="/sys/kernel/debug/kmemleak" +if [ ! -f "$kmemleak" ] +then + echo "$kmemleak not found" + exit $ksft_skip +fi + # ensure filter directory echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" @@ -22,22 +29,17 @@ echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/nr_filters" filter_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/0" -before_kb=$(grep Slab /proc/meminfo | awk '{print $2}') - -# try to leak 3000 KiB -for i in {1..102400}; +# try to leak 128 times +for i in {1..128}; do echo "012345678901234567890123456789" > "$filter_dir/memcg_path" done -after_kb=$(grep Slab /proc/meminfo | awk '{print $2}') -# expect up to 1500 KiB free from other tasks memory -expected_after_kb_max=$((before_kb + 1500)) - -if [ "$after_kb" -gt "$expected_after_kb_max" ] +echo scan > "$kmemleak" +kmemleak_report=$(cat "$kmemleak") +if [ "$kmemleak_report" = "" ] then - echo "maybe memcg_path are leaking: $before_kb -> $after_kb" - exit 1 -else exit 0 fi +echo "$kmemleak_report" +exit 1 diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py index 90ad7409a7a6..35c724a63f6c 100755 --- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py +++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py @@ -6,10 +6,10 @@ import time import _damon_sysfs -def main(): - # access two 10 MiB memory regions, 2 second per each - sz_region = 10 * 1024 * 1024 - proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) +def pass_wss_estimation(sz_region): + # access two regions of given size, 2 seocnds per each region + proc = subprocess.Popen( + ['./access_memory', '2', '%d' % sz_region, '2000', 'repeat']) kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( contexts=[_damon_sysfs.DamonCtx( ops='vaddr', @@ -27,7 +27,7 @@ def main(): exit(1) wss_collected = [] - while proc.poll() == None: + while proc.poll() is None and len(wss_collected) < 40: time.sleep(0.1) err = kdamonds.kdamonds[0].update_schemes_tried_bytes() if err != None: @@ -36,20 +36,43 @@ def main(): wss_collected.append( kdamonds.kdamonds[0].contexts[0].schemes[0].tried_bytes) + proc.terminate() + err = kdamonds.stop() + if err is not None: + print('kdamond stop failed: %s' % err) + exit(1) wss_collected.sort() acceptable_error_rate = 0.2 for percentile in [50, 75]: sample = wss_collected[int(len(wss_collected) * percentile / 100)] error_rate = abs(sample - sz_region) / sz_region - print('%d-th percentile (%d) error %f' % - (percentile, sample, error_rate)) + print('%d-th percentile error %f (expect %d, result %d)' % + (percentile, error_rate, sz_region, sample)) if error_rate > acceptable_error_rate: print('the error rate is not acceptable (> %f)' % acceptable_error_rate) print('samples are as below') - print('\n'.join(['%d' % wss for wss in wss_collected])) - exit(1) + for idx, wss in enumerate(wss_collected): + if idx < len(wss_collected) - 1 and \ + wss_collected[idx + 1] == wss: + continue + print('%d/%d: %d' % (idx, len(wss_collected), wss)) + return False + return True + +def main(): + # DAMON doesn't flush TLB. If the system has large TLB that can cover + # whole test working set, DAMON cannot see the access. Test up to 160 MiB + # test working set. + sz_region_mb = 10 + max_sz_region_mb = 160 + while sz_region_mb <= max_sz_region_mb: + test_pass = pass_wss_estimation(sz_region_mb * 1024 * 1024) + if test_pass is True: + exit(0) + sz_region_mb *= 2 + exit(1) if __name__ == '__main__': main() diff --git a/tools/testing/selftests/dm-verity/Makefile b/tools/testing/selftests/dm-verity/Makefile new file mode 100644 index 000000000000..b75ee08a54af --- /dev/null +++ b/tools/testing/selftests/dm-verity/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := test-dm-verity-keyring.sh + +include ../lib.mk diff --git a/tools/testing/selftests/dm-verity/config b/tools/testing/selftests/dm-verity/config new file mode 100644 index 000000000000..1cd3712fa0a4 --- /dev/null +++ b/tools/testing/selftests/dm-verity/config @@ -0,0 +1,10 @@ +CONFIG_BLK_DEV_DM=y +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_MODULE_UNLOAD=y +CONFIG_KEYS=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_SYSTEM_DATA_VERIFICATION=y diff --git a/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh new file mode 100755 index 000000000000..1f9601ef22f8 --- /dev/null +++ b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh @@ -0,0 +1,873 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test script for dm-verity keyring functionality +# +# This script has two modes depending on kernel configuration: +# +# 1. keyring_unsealed=1 AND require_signatures=1: +# - Upload a test key to the .dm-verity keyring +# - Seal the keyring +# - Create a dm-verity device with a signed root hash +# - Verify signature verification works +# +# 2. keyring_unsealed=0 (default) OR require_signatures=0: +# - Verify the keyring is already sealed (if unsealed=0) +# - Verify keys cannot be added to a sealed keyring +# - Verify the keyring is inactive (not used for verification) +# +# Requirements: +# - Root privileges +# - openssl +# - veritysetup (cryptsetup) +# - keyctl (keyutils) + +set -e + +WORK_DIR="" +DATA_DEV="" +HASH_DEV="" +DM_NAME="verity-test-$$" +CLEANUP_DONE=0 + +# Module parameters (detected at runtime) +KEYRING_UNSEALED="" +REQUIRE_SIGNATURES="" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $*" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $*" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $*" +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $*" >&2 +} + +log_skip() { + echo -e "${YELLOW}[SKIP]${NC} $*" +} + +cleanup() { + if [ "$CLEANUP_DONE" -eq 1 ]; then + return + fi + CLEANUP_DONE=1 + + log_info "Cleaning up..." + + # Remove dm-verity device if it exists + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi + + # Detach loop devices + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + fi + + # Remove work directory + if [ -n "$WORK_DIR" ] && [ -d "$WORK_DIR" ]; then + rm -rf "$WORK_DIR" + fi +} + +trap cleanup EXIT + +die() { + log_error "$*" + exit 1 +} + +find_dm_verity_keyring() { + # The .dm-verity keyring is not linked to user-accessible keyrings, + # so we need to find it via /proc/keys + local serial_hex + serial_hex=$(awk '/\.dm-verity/ {print $1}' /proc/keys 2>/dev/null) + + if [ -z "$serial_hex" ]; then + return 1 + fi + + # Convert hex to decimal for keyctl + echo $((16#$serial_hex)) +} + +get_module_param() { + local param="$1" + local path="/sys/module/dm_verity/parameters/$param" + + if [ -f "$path" ]; then + cat "$path" + else + echo "" + fi +} + +check_requirements() { + log_info "Checking requirements..." + + # Check for root + if [ "$(id -u)" -ne 0 ]; then + die "This script must be run as root" + fi + + # Check for required tools + for cmd in openssl veritysetup keyctl losetup dmsetup dd awk; do + if ! command -v "$cmd" &>/dev/null; then + die "Required command not found: $cmd" + fi + done + + # Check for dm-verity module + if ! modprobe -n dm-verity &>/dev/null; then + die "dm-verity module not available" + fi + + # Verify OpenSSL can create signatures + # OpenSSL cms -sign with -binary -outform DER creates detached signatures by default + log_info "Using OpenSSL for PKCS#7 signatures" +} + +load_dm_verity_module() { + local keyring_unsealed="${1:-0}" + local require_signatures="${2:-0}" + + log_info "Loading dm-verity module with keyring_unsealed=$keyring_unsealed require_signatures=$require_signatures" + + # Unload if already loaded + if lsmod | grep -q '^dm_verity'; then + log_info "Unloading existing dm-verity module..." + modprobe -r dm-verity 2>/dev/null || \ + die "Failed to unload dm-verity module (may be in use)" + sleep 1 + fi + + # Load with specified parameters + modprobe dm-verity keyring_unsealed="$keyring_unsealed" require_signatures="$require_signatures" || \ + die "Failed to load dm-verity module" + + # Wait for keyring to be created (poll with timeout) + local keyring_id="" + local timeout=50 # 5 seconds (50 * 0.1s) + while [ $timeout -gt 0 ]; do + keyring_id=$(find_dm_verity_keyring) && break + sleep 0.1 + timeout=$((timeout - 1)) + done + + if [ -z "$keyring_id" ]; then + die "dm-verity keyring not found after module load (timeout)" + fi + + log_info "Found .dm-verity keyring: $keyring_id" + echo "$keyring_id" > "$WORK_DIR/keyring_id" + + # Read and display module parameters + KEYRING_UNSEALED=$(get_module_param "keyring_unsealed") + REQUIRE_SIGNATURES=$(get_module_param "require_signatures") + + log_info "Module parameters:" + log_info " keyring_unsealed=$KEYRING_UNSEALED" + log_info " require_signatures=$REQUIRE_SIGNATURES" +} + +unload_dm_verity_module() { + log_info "Unloading dm-verity module..." + + # Clean up any dm-verity devices first + local dm_dev + while read -r dm_dev _; do + [ -n "$dm_dev" ] || continue + log_info "Removing dm-verity device: $dm_dev" + dmsetup remove "$dm_dev" 2>/dev/null || true + done < <(dmsetup ls --target verity 2>/dev/null) + + if lsmod | grep -q '^dm_verity'; then + modprobe -r dm-verity 2>/dev/null || \ + log_warn "Failed to unload dm-verity module" + sleep 1 + fi +} + +generate_keys() { + log_info "Generating signing key pair..." + + # Generate private key (2048-bit for faster test execution) + openssl genrsa -out "$WORK_DIR/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # The kernel requires digitalSignature key usage for signature verification + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$WORK_DIR/openssl.cnf" << 'EOF' +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-key + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$WORK_DIR/private.pem" \ + -out "$WORK_DIR/cert.pem" -days 365 \ + -config "$WORK_DIR/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$WORK_DIR/cert.pem" -outform DER \ + -out "$WORK_DIR/cert.der" + + # Show certificate info for debugging + log_info "Certificate details:" + openssl x509 -in "$WORK_DIR/cert.pem" -noout -text 2>/dev/null | \ + grep -E "Subject:|Issuer:|Key Usage|Extended" | head -10 + + log_info "Keys generated successfully" +} + +seal_keyring() { + log_info "Sealing the .dm-verity keyring..." + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + keyctl restrict_keyring "$keyring_id" || \ + die "Failed to seal keyring" + + log_info "Keyring sealed successfully" +} + +create_test_device() { + log_info "Creating test device images..." + + # Create data image with random content (8MB is sufficient for testing) + dd if=/dev/urandom of="$WORK_DIR/data.img" bs=1M count=8 status=none + + # Create hash image (will be populated by veritysetup) + dd if=/dev/zero of="$WORK_DIR/hash.img" bs=1M count=1 status=none + + # Setup loop devices + DATA_DEV=$(losetup --find --show "$WORK_DIR/data.img") + HASH_DEV=$(losetup --find --show "$WORK_DIR/hash.img") + + log_info "Data device: $DATA_DEV" + log_info "Hash device: $HASH_DEV" +} + +create_verity_hash() { + log_info "Creating dm-verity hash tree..." + + local root_hash output + output=$(veritysetup format "$DATA_DEV" "$HASH_DEV" 2>&1) + root_hash=$(echo "$output" | grep "Root hash:" | awk '{print $3}') + + if [ -z "$root_hash" ]; then + log_error "veritysetup format output:" + echo "$output" | sed 's/^/ /' + die "Failed to get root hash from veritysetup format" + fi + + echo "$root_hash" > "$WORK_DIR/root_hash" + log_info "Root hash: $root_hash" +} + +create_detached_signature() { + local infile="$1" + local outfile="$2" + local cert="$3" + local key="$4" + + # Use openssl smime (not cms) for PKCS#7 signatures compatible with kernel + # Flags from working veritysetup example: + # -nocerts: don't include certificate in signature + # -noattr: no signed attributes + # -binary: binary input mode + if openssl smime -sign -nocerts -noattr -binary \ + -in "$infile" \ + -inkey "$key" \ + -signer "$cert" \ + -outform der \ + -out "$outfile" 2>/dev/null; then + return 0 + fi + + log_error "Failed to create signature" + return 1 +} + +activate_verity_device() { + local with_sig="$1" + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Clear dmesg and capture any kernel messages during activation + dmesg -C 2>/dev/null || true + + if [ "$with_sig" = "yes" ]; then + log_info "Activating dm-verity device with signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" \ + --root-hash-signature="$WORK_DIR/root_hash.p7s" 2>&1 + local ret=$? + else + log_info "Activating dm-verity device without signature..." + veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" 2>&1 + local ret=$? + fi + + # Show relevant kernel messages + local kmsg + kmsg=$(dmesg 2>/dev/null | grep -i -E 'verity|pkcs|signature|asymmetric|key' | tail -10) + if [ -n "$kmsg" ]; then + log_info "Kernel messages:" + echo "$kmsg" | while read -r line; do echo " $line"; done + fi + + return $ret +} + +deactivate_verity_device() { + if dmsetup info "$DM_NAME" &>/dev/null; then + dmsetup remove "$DM_NAME" 2>/dev/null || true + fi +} + +show_keyring_status() { + log_info "Keyring status:" + + local keyring_id + keyring_id=$(find_dm_verity_keyring) || true + + if [ -n "$keyring_id" ]; then + echo " Keyring ID: $keyring_id" + keyctl show "$keyring_id" 2>/dev/null || true + grep '\.dm-verity' /proc/keys 2>/dev/null || true + fi +} + +list_keyring_keys() { + log_info "Keys in .dm-verity keyring:" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id" 2>/dev/null) || \ + keyring_id=$(find_dm_verity_keyring) || true + + if [ -z "$keyring_id" ]; then + log_warn "Could not find keyring" + return + fi + + # List all keys in the keyring + local keys + keys=$(keyctl list "$keyring_id" 2>/dev/null) + if [ -z "$keys" ] || [ "$keys" = "keyring is empty" ]; then + echo " (empty)" + else + echo "$keys" | while read -r line; do + echo " $line" + done + + # Show detailed info for each key + log_info "Key details:" + keyctl list "$keyring_id" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+$' | while read -r key_id; do + echo " Key $key_id:" + keyctl describe "$key_id" 2>/dev/null | sed 's/^/ /' + done + fi +} + +generate_named_key() { + local name="$1" + local key_dir="$WORK_DIR/keys/$name" + + mkdir -p "$key_dir" + + # Log to stderr so it doesn't interfere with return value + echo "[INFO] Generating key pair: $name" >&2 + + # Generate private key + openssl genrsa -out "$key_dir/private.pem" 2048 2>/dev/null + + # Create OpenSSL config for certificate extensions + # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for + # the kernel to match keys in the keyring (especially for self-signed certs) + cat > "$key_dir/openssl.cnf" << EOF +[req] +distinguished_name = req_distinguished_name +x509_extensions = v3_ca +prompt = no + +[req_distinguished_name] +CN = dm-verity-test-$name + +[v3_ca] +basicConstraints = critical,CA:FALSE +keyUsage = digitalSignature +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid +EOF + + # Generate self-signed certificate with proper extensions + openssl req -new -x509 -key "$key_dir/private.pem" \ + -out "$key_dir/cert.pem" -days 365 \ + -config "$key_dir/openssl.cnf" 2>/dev/null + + # Convert certificate to DER format for kernel + openssl x509 -in "$key_dir/cert.pem" -outform DER \ + -out "$key_dir/cert.der" + + # Return the key directory path (only this goes to stdout) + echo "$key_dir" +} + +upload_named_key() { + local name="$1" + local key_dir="$2" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Uploading key '$name' to keyring..." + + local key_id + if key_id=$(keyctl padd asymmetric "$name" "$keyring_id" \ + < "$key_dir/cert.der" 2>&1); then + log_info "Key '$name' uploaded with ID: $key_id" + echo "$key_id" > "$key_dir/key_id" + return 0 + else + log_error "Failed to upload key '$name': $key_id" + return 1 + fi +} + +# +# Test: Verify sealed keyring rejects key additions +# +test_sealed_keyring_rejects_keys() { + log_info "TEST: Verify sealed keyring rejects key additions" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + generate_keys + + # Try to add a key - should fail + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Key addition should have been rejected on sealed keyring" + return 1 + else + log_pass "Sealed keyring correctly rejected key addition" + return 0 + fi +} + +# +# Test: Multiple keys in keyring +# +test_multiple_keys() { + log_info "TEST: Multiple keys in keyring" + + local key1_dir key2_dir key3_dir + + # Generate three different keys + key1_dir=$(generate_named_key "vendor-a") + key2_dir=$(generate_named_key "vendor-b") + key3_dir=$(generate_named_key "vendor-c") + + # Upload all three keys + upload_named_key "vendor-a" "$key1_dir" || return 1 + upload_named_key "vendor-b" "$key2_dir" || return 1 + upload_named_key "vendor-c" "$key3_dir" || return 1 + + log_info "" + log_info "Keys in keyring before sealing:" + list_keyring_keys + show_keyring_status + + # Seal the keyring + log_info "" + seal_keyring + + # List keys after sealing + log_info "" + log_info "Keys in keyring after sealing:" + list_keyring_keys + show_keyring_status + + log_pass "Key upload and keyring sealing succeeded" + + # Create test device + log_info "" + create_test_device + create_verity_hash + + # Test 1: Sign with key1, should verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-a key" + if ! sign_root_hash_with_key "$key1_dir"; then + log_fail "Failed to sign with vendor-a key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-a key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-a key should succeed" + return 1 + fi + + # Test 2: Sign with key2, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-b key" + if ! sign_root_hash_with_key "$key2_dir"; then + log_fail "Failed to sign with vendor-b key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-b key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-b key should succeed" + return 1 + fi + + # Test 3: Sign with key3, should also verify successfully + log_info "" + log_info "Sub-test: Verify with vendor-c key" + if ! sign_root_hash_with_key "$key3_dir"; then + log_fail "Failed to sign with vendor-c key" + return 1 + fi + if activate_verity_device "yes"; then + log_pass "Verification with vendor-c key succeeded" + deactivate_verity_device + else + log_fail "Verification with vendor-c key should succeed" + return 1 + fi + + # Test 4: Generate a key NOT in the keyring, should fail + log_info "" + log_info "Sub-test: Verify with unknown key (should fail)" + local unknown_key_dir + unknown_key_dir=$(generate_named_key "unknown-vendor") + if ! sign_root_hash_with_key "$unknown_key_dir"; then + log_fail "Failed to sign with unknown-vendor key" + return 1 + fi + if activate_verity_device "yes"; then + log_fail "Verification with unknown key should fail" + deactivate_verity_device + return 1 + else + log_pass "Verification with unknown key correctly rejected" + fi + + log_info "" + log_pass "Multiple keys test completed successfully" + return 0 +} + +sign_root_hash_with_key() { + local key_dir="$1" + + local root_hash + root_hash=$(cat "$WORK_DIR/root_hash") + + # Create the data to sign (hex string, not binary) + echo -n "$root_hash" > "$WORK_DIR/root_hash.txt" + + # Debug: show exactly what we're signing + log_info "Root hash (hex): $root_hash" + log_info "Root hash hex string size: $(wc -c < "$WORK_DIR/root_hash.txt") bytes" + + # Create detached PKCS#7 signature + if ! create_detached_signature "$WORK_DIR/root_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem"; then + log_error "Failed to sign root hash with key from $key_dir" + return 1 + fi + + # Debug: show signing certificate info + log_info "Signed with certificate:" + openssl x509 -in "$key_dir/cert.pem" -noout -subject 2>/dev/null | sed 's/^/ /' + + # Debug: verify signature locally + # -nointern: cert not in signature, use -certfile + # -noverify: skip certificate chain validation (self-signed) + if openssl smime -verify -binary -inform der -nointern -noverify \ + -in "$WORK_DIR/root_hash.p7s" \ + -content "$WORK_DIR/root_hash.txt" \ + -certfile "$key_dir/cert.pem" \ + -out /dev/null 2>/dev/null; then + log_info "Local signature verification: PASSED" + else + log_warn "Local signature verification: FAILED" + fi + return 0 +} + +# +# Test: Verify corrupted signatures are rejected +# +test_corrupted_signature() { + log_info "TEST: Verify corrupted signatures are rejected" + + # This test requires a valid setup from test_multiple_keys or similar + # It modifies the signature file and verifies rejection + + if [ ! -f "$WORK_DIR/root_hash.p7s" ]; then + log_warn "No signature file found, skipping corrupted signature test" + return 0 + fi + + # Save original signature + cp "$WORK_DIR/root_hash.p7s" "$WORK_DIR/root_hash.p7s.orig" + + # Test 1: Truncated signature + log_info "Sub-test: Truncated signature (should fail)" + head -c 100 "$WORK_DIR/root_hash.p7s.orig" > "$WORK_DIR/root_hash.p7s" + if activate_verity_device "yes"; then + log_fail "Truncated signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Truncated signature correctly rejected" + fi + + # Test 2: Corrupted signature (flip some bytes) + log_info "Sub-test: Corrupted signature bytes (should fail)" + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + # Corrupt bytes in the middle of the signature + local sig_size + sig_size=$(wc -c < "$WORK_DIR/root_hash.p7s") + local corrupt_offset=$((sig_size / 2)) + printf '\xff\xff\xff\xff' | dd of="$WORK_DIR/root_hash.p7s" bs=1 seek=$corrupt_offset conv=notrunc 2>/dev/null + if activate_verity_device "yes"; then + log_fail "Corrupted signature should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Corrupted signature correctly rejected" + fi + + # Test 3: Signature over wrong data (sign different content) + log_info "Sub-test: Signature over wrong data (should fail)" + # Create a different root hash (all zeros as hex string) + printf '%064d' 0 > "$WORK_DIR/wrong_hash.txt" + # Get the first key directory that was used + local key_dir="$WORK_DIR/keys/vendor-a" + if [ -d "$key_dir" ]; then + create_detached_signature "$WORK_DIR/wrong_hash.txt" "$WORK_DIR/root_hash.p7s" \ + "$key_dir/cert.pem" "$key_dir/private.pem" + if activate_verity_device "yes"; then + log_fail "Signature over wrong data should be rejected" + deactivate_verity_device + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + return 1 + else + log_pass "Signature over wrong data correctly rejected" + fi + else + log_warn "Key directory not found, skipping wrong data test" + fi + + # Restore original signature + cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s" + + log_pass "Corrupted signature test completed successfully" + return 0 +} + +# +# Test: Verify keyring is sealed when keyring_unsealed=0 +# +test_keyring_sealed_by_default() { + log_info "TEST: Verify keyring is sealed by default (keyring_unsealed=0)" + + local keyring_id + keyring_id=$(cat "$WORK_DIR/keyring_id") + + log_info "Current keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + generate_keys + + # Try to add a key - should fail if keyring is sealed + log_info "Attempting to add key to sealed keyring..." + if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \ + < "$WORK_DIR/cert.der" 2>/dev/null; then + log_fail "Keyring should be sealed when keyring_unsealed=0" + list_keyring_keys + return 1 + else + log_pass "Keyring is correctly sealed when keyring_unsealed=0" + log_info "Keyring state after failed add attempt:" + list_keyring_keys + return 0 + fi +} + +# +# Test: Verify dm-verity keyring is inactive when sealed empty +# +test_keyring_inactive_when_empty() { + log_info "TEST: Verify dm-verity keyring is inactive when sealed empty" + + # When keyring_unsealed=0, the keyring is sealed immediately while empty + # This means it should NOT be used for verification (nr_leaves_on_tree=0) + + log_info "Keyring state (should be empty and sealed):" + list_keyring_keys + show_keyring_status + + create_test_device + create_verity_hash + + # Without any keys in the dm-verity keyring, and with it sealed, + # verification should fall through to the secondary/platform keyrings + # and likely succeed (if require_signatures=0) or fail (if =1) + + log_info "Sub-test: Device activation with sealed empty keyring" + if [ "$REQUIRE_SIGNATURES" = "Y" ] || [ "$REQUIRE_SIGNATURES" = "1" ]; then + if activate_verity_device "no"; then + log_fail "Device should NOT activate without signature when require_signatures=1" + deactivate_verity_device + return 1 + else + log_pass "Device correctly rejected (require_signatures=1, no valid signature)" + fi + else + if activate_verity_device "no"; then + log_pass "Device activated (require_signatures=0, empty dm-verity keyring is inactive)" + deactivate_verity_device + else + log_fail "Device should activate when require_signatures=0" + return 1 + fi + fi + + return 0 +} + +main() { + local rc=0 + + log_info "=== dm-verity keyring test ===" + log_info "" + + # Create work directory + WORK_DIR=$(mktemp -d -t dm-verity-test.XXXXXX) + log_info "Work directory: $WORK_DIR" + + check_requirements + + # + # Test 1: UNSEALED keyring mode (keyring_unsealed=1) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: UNSEALED KEYRING ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 1 1 # keyring_unsealed=1, require_signatures=1 + show_keyring_status + + log_info "" + if ! test_multiple_keys; then + rc=1 + fi + + # After sealing, verify it rejects new keys + log_info "" + if ! test_sealed_keyring_rejects_keys; then + rc=1 + fi + + # Test corrupted signatures are rejected + log_info "" + if ! test_corrupted_signature; then + rc=1 + fi + + # Clean up devices before reloading module + deactivate_verity_device + if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then + losetup -d "$DATA_DEV" 2>/dev/null || true + DATA_DEV="" + fi + if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then + losetup -d "$HASH_DEV" 2>/dev/null || true + HASH_DEV="" + fi + + # + # Test 2: SEALED keyring mode (keyring_unsealed=0, default) + # + log_info "" + log_info "========================================" + log_info "=== TEST MODE: SEALED KEYRING (default) ===" + log_info "========================================" + log_info "" + + load_dm_verity_module 0 0 # keyring_unsealed=0, require_signatures=0 + show_keyring_status + + log_info "" + if ! test_keyring_sealed_by_default; then + rc=1 + fi + + log_info "" + if ! test_keyring_inactive_when_empty; then + rc=1 + fi + + # + # Summary + # + log_info "" + log_info "========================================" + if [ $rc -eq 0 ]; then + log_info "=== All tests PASSED ===" + else + log_error "=== Some tests FAILED ===" + fi + log_info "========================================" + + return $rc +} + +main "$@" diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index f5c71d993750..8154d6d429d3 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -15,12 +15,6 @@ TEST_PROGS := \ hds.py \ napi_id.py \ napi_threaded.py \ - netcons_basic.sh \ - netcons_cmdline.sh \ - netcons_fragmented_msg.sh \ - netcons_overflow.sh \ - netcons_sysdata.sh \ - netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c index e894037d2e3e..3c0745b68bfa 100644 --- a/tools/testing/selftests/drivers/net/gro.c +++ b/tools/testing/selftests/drivers/net/gro.c @@ -3,26 +3,45 @@ * This testsuite provides conformance testing for GRO coalescing. * * Test cases: - * 1.data + * + * data_*: * Data packets of the same size and same header setup with correct * sequence numbers coalesce. The one exception being the last data * packet coalesced: it can be smaller than the rest and coalesced * as long as it is in the same flow. - * 2.ack + * - data_same: same size packets coalesce + * - data_lrg_sml: large then small coalesces + * - data_sml_lrg: small then large doesn't coalesce + * + * ack: * Pure ACK does not coalesce. - * 3.flags - * Specific test cases: no packets with PSH, SYN, URG, RST set will - * be coalesced. - * 4.tcp + * + * flags_*: + * No packets with PSH, SYN, URG, RST, CWR set will be coalesced. + * - flags_psh, flags_syn, flags_rst, flags_urg, flags_cwr + * + * tcp_*: * Packets with incorrect checksum, non-consecutive seqno and * different TCP header options shouldn't coalesce. Nit: given that * some extension headers have paddings, such as timestamp, headers - * that are padding differently would not be coalesced. - * 5.ip: - * Packets with different (ECN, TTL, TOS) header, ip options or - * ip fragments (ipv6) shouldn't coalesce. - * 6.large: + * that are padded differently would not be coalesced. + * - tcp_csum: incorrect checksum + * - tcp_seq: non-consecutive sequence numbers + * - tcp_ts: different timestamps + * - tcp_opt: different TCP options + * + * ip_*: + * Packets with different (ECN, TTL, TOS) header, IP options or + * IP fragments shouldn't coalesce. + * - ip_ecn, ip_tos: shared between IPv4/IPv6 + * - ip_ttl, ip_opt, ip_frag4: IPv4 only + * - ip_id_df*: IPv4 IP ID field coalescing tests + * - ip_frag6, ip_v6ext_*: IPv6 only + * + * large_*: * Packets larger than GRO_MAX_SIZE packets shouldn't coalesce. + * - large_max: exceeding max size + * - large_rem: remainder handling * * MSS is defined as 4096 - header because if it is too small * (i.e. 1500 MTU - header), it will result in many packets, @@ -79,6 +98,15 @@ #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +enum flush_id_case { + FLUSH_ID_DF1_INC, + FLUSH_ID_DF1_FIXED, + FLUSH_ID_DF0_INC, + FLUSH_ID_DF0_FIXED, + FLUSH_ID_DF1_INC_FIXED, + FLUSH_ID_DF1_FIXED_INC, +}; + static const char *addr6_src = "fdaa::2"; static const char *addr6_dst = "fdaa::1"; static const char *addr4_src = "192.168.1.200"; @@ -95,7 +123,6 @@ static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; static bool ipip; -static const int num_flush_id_cases = 6; static void vlog(const char *fmt, ...) { @@ -127,19 +154,19 @@ static void setup_sock_filter(int fd) /* Overridden later if exthdrs are used: */ opt_ipproto_off = ipproto_off; - if (strcmp(testname, "ip") == 0) { - if (proto == PF_INET) - optlen = sizeof(struct ip_timestamp); - else { - BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); - BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); - BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); - - /* same size for HBH and Fragment extension header types */ - optlen = MIN_EXTHDR_SIZE; - opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) - + offsetof(struct ip6_ext, ip6e_nxt); - } + if (strcmp(testname, "ip_opt") == 0) { + optlen = sizeof(struct ip_timestamp); + } else if (strcmp(testname, "ip_frag6") == 0 || + strcmp(testname, "ip_v6ext_same") == 0 || + strcmp(testname, "ip_v6ext_diff") == 0) { + BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE); + BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE); + + /* same size for HBH and Fragment extension header types */ + optlen = MIN_EXTHDR_SIZE; + opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) + + offsetof(struct ip6_ext, ip6e_nxt); } /* this filter validates the following: @@ -333,32 +360,58 @@ static void create_packet(void *buf, int seq_offset, int ack_offset, fill_datalinklayer(buf); } -/* send one extra flag, not first and not last pkt */ -static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn, - int rst, int urg) +#ifndef TH_CWR +#define TH_CWR 0x80 +#endif +static void set_flags(struct tcphdr *tcph, int payload_len, int psh, int syn, + int rst, int urg, int cwr) { - static char flag_buf[MAX_HDR_LEN + PAYLOAD_LEN]; - static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; - int payload_len, pkt_size, flag, i; - struct tcphdr *tcph; - - payload_len = PAYLOAD_LEN * psh; - pkt_size = total_hdr_len + payload_len; - flag = NUM_PACKETS / 2; - - create_packet(flag_buf, flag * payload_len, 0, payload_len, 0); - - tcph = (struct tcphdr *)(flag_buf + tcp_offset); tcph->psh = psh; tcph->syn = syn; tcph->rst = rst; tcph->urg = urg; + if (cwr) + tcph->th_flags |= TH_CWR; + else + tcph->th_flags &= ~TH_CWR; tcph->check = 0; tcph->check = tcp_checksum(tcph, payload_len); +} + +/* send extra flags of the (NUM_PACKETS / 2) and (NUM_PACKETS / 2 - 1) + * pkts, not first and not last pkt + */ +static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn, + int rst, int urg, int cwr) +{ + static char flag_buf[2][MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + int payload_len, pkt_size, i; + struct tcphdr *tcph; + int flag[2]; + + payload_len = PAYLOAD_LEN * (psh || cwr); + pkt_size = total_hdr_len + payload_len; + flag[0] = NUM_PACKETS / 2; + flag[1] = NUM_PACKETS / 2 - 1; + + /* Create and configure packets with flags + */ + for (i = 0; i < 2; i++) { + if (flag[i] > 0) { + create_packet(flag_buf[i], flag[i] * payload_len, 0, + payload_len, 0); + tcph = (struct tcphdr *)(flag_buf[i] + tcp_offset); + set_flags(tcph, payload_len, psh, syn, rst, urg, cwr); + } + } for (i = 0; i < NUM_PACKETS + 1; i++) { - if (i == flag) { - write_packet(fd, flag_buf, pkt_size, daddr); + if (i == flag[0]) { + write_packet(fd, flag_buf[0], pkt_size, daddr); + continue; + } else if (i == flag[1] && cwr) { + write_packet(fd, flag_buf[1], pkt_size, daddr); continue; } create_packet(buf, i * PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); @@ -648,7 +701,8 @@ static void fix_ip4_checksum(struct iphdr *iph) iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); } -static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, + enum flush_id_case tcase) { static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -667,7 +721,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); switch (tcase) { - case 0: /* DF=1, Incrementing - should coalesce */ + case FLUSH_ID_DF1_INC: /* DF=1, Incrementing - should coalesce */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -675,7 +729,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 1: /* DF=1, Fixed - should coalesce */ + case FLUSH_ID_DF1_FIXED: /* DF=1, Fixed - should coalesce */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -683,7 +737,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(8); break; - case 2: /* DF=0, Incrementing - should coalesce */ + case FLUSH_ID_DF0_INC: /* DF=0, Incrementing - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -691,7 +745,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 3: /* DF=0, Fixed - should coalesce */ + case FLUSH_ID_DF0_FIXED: /* DF=0, Fixed - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -699,9 +753,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(8); break; - case 4: /* DF=1, two packets incrementing, and one fixed - should - * coalesce only the first two packets - */ + case FLUSH_ID_DF1_INC_FIXED: /* DF=1, two packets incrementing, and + * one fixed - should coalesce only the + * first two packets + */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -713,9 +768,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) send_three = true; break; - case 5: /* DF=1, two packets fixed, and one incrementing - should - * coalesce only the first two packets - */ + case FLUSH_ID_DF1_FIXED_INC: /* DF=1, two packets fixed, and one + * incrementing - should coalesce only + * the first two packets + */ iph1->frag_off |= htons(IP_DF); iph1->id = htons(8); @@ -739,16 +795,6 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) } } -static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) -{ - for (int i = 0; i < num_flush_id_cases; i++) { - sleep(1); - send_flush_id_case(fd, daddr, i); - sleep(1); - write_packet(fd, fin_pkt, total_hdr_len, daddr); - } -} - static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -926,6 +972,28 @@ static void set_timeout(int fd) error(1, errno, "cannot set timeout, setsockopt failed"); } +static void set_rcvbuf(int fd) +{ + int bufsize = 1 * 1024 * 1024; /* 1 MB */ + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsize, sizeof(bufsize))) + error(1, errno, "cannot set rcvbuf size, setsockopt failed"); +} + +static void recv_error(int fd, int rcv_errno) +{ + struct tpacket_stats stats; + socklen_t len; + + len = sizeof(stats); + if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len)) + error(1, errno, "can't get stats"); + + fprintf(stderr, "Socket stats: packets=%u, drops=%u\n", + stats.tp_packets, stats.tp_drops); + error(1, rcv_errno, "could not receive"); +} + static void check_recv_pkts(int fd, int *correct_payload, int correct_num_pkts) { @@ -950,7 +1018,7 @@ static void check_recv_pkts(int fd, int *correct_payload, ip_ext_len = 0; pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); if (pkt_size < 0) - error(1, errno, "could not receive"); + recv_error(fd, errno); if (iph->version == 4) ip_ext_len = (iph->ihl - 5) * 4; @@ -1008,108 +1076,131 @@ static void gro_sender(void) daddr.sll_halen = ETH_ALEN; create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1); - if (strcmp(testname, "data") == 0) { + /* data sub-tests */ + if (strcmp(testname, "data_same") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "data_lrg_sml") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "data_sml_lrg") == 0) { send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* ack test */ } else if (strcmp(testname, "ack") == 0) { send_ack(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "flags") == 0) { - send_flags(txfd, &daddr, 1, 0, 0, 0); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - send_flags(txfd, &daddr, 0, 1, 0, 0); + /* flags sub-tests */ + } else if (strcmp(testname, "flags_psh") == 0) { + send_flags(txfd, &daddr, 1, 0, 0, 0, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - send_flags(txfd, &daddr, 0, 0, 1, 0); + } else if (strcmp(testname, "flags_syn") == 0) { + send_flags(txfd, &daddr, 0, 1, 0, 0, 0); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - send_flags(txfd, &daddr, 0, 0, 0, 1); + } else if (strcmp(testname, "flags_rst") == 0) { + send_flags(txfd, &daddr, 0, 0, 1, 0, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "flags_urg") == 0) { + send_flags(txfd, &daddr, 0, 0, 0, 1, 0); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "flags_cwr") == 0) { + send_flags(txfd, &daddr, 0, 0, 0, 0, 1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "tcp") == 0) { + + /* tcp sub-tests */ + } else if (strcmp(testname, "tcp_csum") == 0) { send_changed_checksum(txfd, &daddr); - /* Adding sleep before sending FIN so that it is not - * received prior to other packets. - */ usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_seq") == 0) { send_changed_seq(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_ts") == 0) { send_changed_ts(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "tcp_opt") == 0) { send_diff_opt(txfd, &daddr); usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } else if (strcmp(testname, "ip") == 0) { + + /* ip sub-tests - shared between IPv4 and IPv6 */ + } else if (strcmp(testname, "ip_ecn") == 0) { send_changed_ECN(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - + } else if (strcmp(testname, "ip_tos") == 0) { send_changed_tos(txfd, &daddr); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - if (proto == PF_INET) { - /* Modified packets may be received out of order. - * Sleep function added to enforce test boundaries - * so that fin pkts are not received prior to other pkts. - */ - sleep(1); - send_changed_ttl(txfd, &daddr); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - send_ip_options(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - send_fragment4(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - test_flush_id(txfd, &daddr, fin_pkt); - } else if (proto == PF_INET6) { - sleep(1); - send_fragment6(txfd, &daddr); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - /* send IPv6 packets with ext header with same payload */ - send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - - sleep(1); - /* send IPv6 packets with ext header with different payload */ - send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); - sleep(1); - write_packet(txfd, fin_pkt, total_hdr_len, &daddr); - } - } else if (strcmp(testname, "large") == 0) { - /* 20 is the difference between min iphdr size - * and min ipv6hdr size. Like MAX_HDR_SIZE, - * MAX_PAYLOAD is defined with the larger header of the two. - */ + + /* ip sub-tests - IPv4 only */ + } else if (strcmp(testname, "ip_ttl") == 0) { + send_changed_ttl(txfd, &daddr); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_opt") == 0) { + send_ip_options(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_frag4") == 0) { + send_fragment4(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df0_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df0_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC_FIXED); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) { + send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED_INC); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* ip sub-tests - IPv6 only */ + } else if (strcmp(testname, "ip_frag6") == 0) { + send_fragment6(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_v6ext_same") == 0) { + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "ip_v6ext_diff") == 0) { + send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + /* large sub-tests */ + } else if (strcmp(testname, "large_max") == 0) { int offset = (proto == PF_INET && !ipip) ? 20 : 0; int remainder = (MAX_PAYLOAD + offset) % MSS; send_large(txfd, &daddr, remainder); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else if (strcmp(testname, "large_rem") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; send_large(txfd, &daddr, remainder + 1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else { - error(1, 0, "Unknown testcase"); + error(1, 0, "Unknown testcase: %s", testname); } if (close(txfd)) @@ -1126,132 +1217,166 @@ static void gro_receiver(void) error(1, 0, "socket creation"); setup_sock_filter(rxfd); set_timeout(rxfd); + set_rcvbuf(rxfd); bind_packetsocket(rxfd); ksft_ready(); memset(correct_payload, 0, sizeof(correct_payload)); - if (strcmp(testname, "data") == 0) { + /* data sub-tests */ + if (strcmp(testname, "data_same") == 0) { printf("pure data packet of same size: "); correct_payload[0] = PAYLOAD_LEN * 2; check_recv_pkts(rxfd, correct_payload, 1); - + } else if (strcmp(testname, "data_lrg_sml") == 0) { printf("large data packets followed by a smaller one: "); correct_payload[0] = PAYLOAD_LEN * 1.5; check_recv_pkts(rxfd, correct_payload, 1); - + } else if (strcmp(testname, "data_sml_lrg") == 0) { printf("small data packets followed by a larger one: "); correct_payload[0] = PAYLOAD_LEN / 2; correct_payload[1] = PAYLOAD_LEN; check_recv_pkts(rxfd, correct_payload, 2); + + /* ack test */ } else if (strcmp(testname, "ack") == 0) { printf("duplicate ack and pure ack: "); check_recv_pkts(rxfd, correct_payload, 3); - } else if (strcmp(testname, "flags") == 0) { + + /* flags sub-tests */ + } else if (strcmp(testname, "flags_psh") == 0) { correct_payload[0] = PAYLOAD_LEN * 3; correct_payload[1] = PAYLOAD_LEN * 2; - printf("psh flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "flags_syn") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; correct_payload[1] = 0; correct_payload[2] = PAYLOAD_LEN * 2; printf("syn flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - + } else if (strcmp(testname, "flags_rst") == 0) { + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; printf("rst flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - + } else if (strcmp(testname, "flags_urg") == 0) { + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = 0; + correct_payload[2] = PAYLOAD_LEN * 2; printf("urg flag ends coalescing: "); check_recv_pkts(rxfd, correct_payload, 3); - } else if (strcmp(testname, "tcp") == 0) { + } else if (strcmp(testname, "flags_cwr") == 0) { correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - correct_payload[2] = PAYLOAD_LEN; - correct_payload[3] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN * 2; + correct_payload[2] = PAYLOAD_LEN * 2; + printf("cwr flag ends coalescing: "); + check_recv_pkts(rxfd, correct_payload, 3); + /* tcp sub-tests */ + } else if (strcmp(testname, "tcp_csum") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("changed checksum does not coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "tcp_seq") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("Wrong Seq number doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - - printf("Different timestamp doesn't coalesce: "); + } else if (strcmp(testname, "tcp_ts") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + correct_payload[3] = PAYLOAD_LEN; + printf("Different timestamp doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 4); - - printf("Different options doesn't coalesce: "); + } else if (strcmp(testname, "tcp_opt") == 0) { correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + printf("Different options doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - } else if (strcmp(testname, "ip") == 0) { + + /* ip sub-tests - shared between IPv4 and IPv6 */ + } else if (strcmp(testname, "ip_ecn") == 0) { correct_payload[0] = PAYLOAD_LEN; correct_payload[1] = PAYLOAD_LEN; - printf("different ECN doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - + } else if (strcmp(testname, "ip_tos") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; printf("different tos doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); - if (proto == PF_INET) { - printf("different ttl doesn't coalesce: "); - check_recv_pkts(rxfd, correct_payload, 2); - - printf("ip options doesn't coalesce: "); - correct_payload[2] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 3); - - printf("fragmented ip4 doesn't coalesce: "); - check_recv_pkts(rxfd, correct_payload, 2); - - /* is_atomic checks */ - printf("DF=1, Incrementing - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=1, Fixed - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=0, Incrementing - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=0, Fixed - should coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - - printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - } else if (proto == PF_INET6) { - /* GRO doesn't check for ipv6 hop limit when flushing. - * Hence no corresponding test to the ipv4 case. - */ - printf("fragmented ip6 doesn't coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - correct_payload[1] = PAYLOAD_LEN; - correct_payload[2] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 3); - - printf("ipv6 with ext header does coalesce: "); - correct_payload[0] = PAYLOAD_LEN * 2; - check_recv_pkts(rxfd, correct_payload, 1); - - printf("ipv6 with ext header with different payloads doesn't coalesce: "); - correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); - } - } else if (strcmp(testname, "large") == 0) { + /* ip sub-tests - IPv4 only */ + } else if (strcmp(testname, "ip_ttl") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("different ttl doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_opt") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + printf("ip options doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "ip_frag4") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("fragmented ip4 doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_id_df1_inc") == 0) { + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df1_fixed") == 0) { + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df0_inc") == 0) { + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df0_fixed") == 0) { + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) { + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) { + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + /* ip sub-tests - IPv6 only */ + } else if (strcmp(testname, "ip_frag6") == 0) { + /* GRO doesn't check for ipv6 hop limit when flushing. + * Hence no corresponding test to the ipv4 case. + */ + printf("fragmented ip6 doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + correct_payload[2] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 3); + } else if (strcmp(testname, "ip_v6ext_same") == 0) { + printf("ipv6 with ext header does coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + } else if (strcmp(testname, "ip_v6ext_diff") == 0) { + printf("ipv6 with ext header with different payloads doesn't coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + /* large sub-tests */ + } else if (strcmp(testname, "large_max") == 0) { int offset = (proto == PF_INET && !ipip) ? 20 : 0; int remainder = (MAX_PAYLOAD + offset) % MSS; @@ -1259,14 +1384,18 @@ static void gro_receiver(void) correct_payload[1] = remainder; printf("Shouldn't coalesce if exceed IP max pkt size: "); check_recv_pkts(rxfd, correct_payload, 2); + } else if (strcmp(testname, "large_rem") == 0) { + int offset = (proto == PF_INET && !ipip) ? 20 : 0; + int remainder = (MAX_PAYLOAD + offset) % MSS; /* last segment sent individually, doesn't start new segment */ - correct_payload[0] = correct_payload[0] - remainder; + correct_payload[0] = (MAX_PAYLOAD + offset) - remainder; correct_payload[1] = remainder + 1; correct_payload[2] = remainder + 1; + printf("last segment sent individually: "); check_recv_pkts(rxfd, correct_payload, 3); } else { - error(1, 0, "Test case error, should never trigger"); + error(1, 0, "Test case error: unknown testname %s", testname); } if (close(rxfd)) diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index ba83713bf7b5..cbc1b19dbc91 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -9,18 +9,36 @@ binary in different configurations and checking for correct packet coalescing behavior. Test cases: - - data: Data packets with same size/headers and correct seq numbers coalesce + - data_same: Same size data packets coalesce + - data_lrg_sml: Large packet followed by smaller one coalesces + - data_sml_lrg: Small packet followed by larger one doesn't coalesce - ack: Pure ACK packets do not coalesce - - flags: Packets with PSH, SYN, URG, RST flags do not coalesce - - tcp: Packets with incorrect checksum, non-consecutive seqno don't coalesce - - ip: Packets with different ECN, TTL, TOS, or IP options don't coalesce - - large: Packets larger than GRO_MAX_SIZE don't coalesce + - flags_psh: Packets with PSH flag don't coalesce + - flags_syn: Packets with SYN flag don't coalesce + - flags_rst: Packets with RST flag don't coalesce + - flags_urg: Packets with URG flag don't coalesce + - flags_cwr: Packets with CWR flag don't coalesce + - tcp_csum: Packets with incorrect checksum don't coalesce + - tcp_seq: Packets with non-consecutive seqno don't coalesce + - tcp_ts: Packets with different timestamp options don't coalesce + - tcp_opt: Packets with different TCP options don't coalesce + - ip_ecn: Packets with different ECN don't coalesce + - ip_tos: Packets with different TOS don't coalesce + - ip_ttl: (IPv4) Packets with different TTL don't coalesce + - ip_opt: (IPv4) Packets with IP options don't coalesce + - ip_frag4: (IPv4) IPv4 fragments don't coalesce + - ip_id_df*: (IPv4) IP ID field coalescing tests + - ip_frag6: (IPv6) IPv6 fragments don't coalesce + - ip_v6ext_same: (IPv6) IPv6 ext header with same payload coalesces + - ip_v6ext_diff: (IPv6) IPv6 ext header with different payload doesn't coalesce + - large_max: Packets exceeding GRO_MAX_SIZE don't coalesce + - large_rem: Large packet remainder handling """ import os from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import NetDrvEpEnv, KsftXfailEx -from lib.py import cmd, defer, bkg, ip +from lib.py import bkg, cmd, defer, ethtool, ip from lib.py import ksft_variants @@ -70,49 +88,150 @@ def _set_mtu_restore(dev, mtu, host): defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host) -def _setup(cfg, test_name): +def _set_ethtool_feat(dev, current, feats, host=None): + s2n = {True: "on", False: "off"} + + new = ["-K", dev] + old = ["-K", dev] + no_change = True + for name, state in feats.items(): + new += [name, s2n[state]] + old += [name, s2n[current[name]["active"]]] + + if current[name]["active"] != state: + no_change = False + if current[name]["fixed"]: + raise KsftXfailEx(f"Device does not support {name}") + if no_change: + return + + eth_cmd = ethtool(" ".join(new), host=host) + defer(ethtool, " ".join(old), host=host) + + # If ethtool printed something kernel must have modified some features + if eth_cmd.stdout: + ksft_pr(eth_cmd) + + +def _setup(cfg, mode, test_name): """ Setup hardware loopback mode for GRO testing. """ if not hasattr(cfg, "bin_remote"): cfg.bin_local = cfg.test_dir / "gro" cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - # "large" test needs at least 4k MTU - if test_name == "large": + if not hasattr(cfg, "feat"): + cfg.feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}", + host=cfg.remote, json=True)[0] + + # "large_*" tests need at least 4k MTU + if test_name.startswith("large_"): _set_mtu_restore(cfg.dev, 4096, None) _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote) - flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" - irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" - - _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) - _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + if mode == "sw": + flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout" + irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs" + + _write_defer_restore(cfg, flush_path, "200000", defer_undo=True) + _write_defer_restore(cfg, irq_path, "10", defer_undo=True) + + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": True, + "rx-gro-hw": False, + "large-receive-offload": False}) + elif mode == "hw": + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": False, + "rx-gro-hw": True, + "large-receive-offload": False}) + + # Some NICs treat HW GRO as a GRO sub-feature so disabling GRO + # will also clear HW GRO. Use a hack of installing XDP generic + # to skip SW GRO, even when enabled. + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + if not feat["rx-gro-hw"]["active"]: + ksft_pr("Driver clears HW GRO and SW GRO is cleared, using generic XDP workaround") + prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" + ip(f"link set dev {cfg.ifname} xdpgeneric obj {prog} sec xdp") + defer(ip, f"link set dev {cfg.ifname} xdpgeneric off") + + # Attaching XDP may change features, fetch the latest state + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + + _set_ethtool_feat(cfg.ifname, feat, + {"generic-receive-offload": True, + "rx-gro-hw": True, + "large-receive-offload": False}) + elif mode == "lro": + # netdevsim advertises LRO for feature inheritance testing with + # bonding/team tests but it doesn't actually perform the offload + cfg.require_nsim(nsim_test=False) + + _set_ethtool_feat(cfg.ifname, cfg.feat, + {"generic-receive-offload": False, + "rx-gro-hw": False, + "large-receive-offload": True}) try: # Disable TSO for local tests cfg.require_nsim() # will raise KsftXfailEx if not running on nsim - cmd(f"ethtool -K {cfg.ifname} gro on tso off") - cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote) + _set_ethtool_feat(cfg.remote_ifname, cfg.remote_feat, + {"tcp-segmentation-offload": False}, + host=cfg.remote) except KsftXfailEx: pass + def _gro_variants(): """Generator that yields all combinations of protocol and test types.""" - for protocol in ["ipv4", "ipv6", "ipip"]: - for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]: - yield protocol, test_name + # Tests that work for all protocols + common_tests = [ + "data_same", "data_lrg_sml", "data_sml_lrg", + "ack", + "flags_psh", "flags_syn", "flags_rst", "flags_urg", "flags_cwr", + "tcp_csum", "tcp_seq", "tcp_ts", "tcp_opt", + "ip_ecn", "ip_tos", + "large_max", "large_rem", + ] + + # Tests specific to IPv4 + ipv4_tests = [ + "ip_ttl", "ip_opt", "ip_frag4", + "ip_id_df1_inc", "ip_id_df1_fixed", + "ip_id_df0_inc", "ip_id_df0_fixed", + "ip_id_df1_inc_fixed", "ip_id_df1_fixed_inc", + ] + + # Tests specific to IPv6 + ipv6_tests = [ + "ip_frag6", "ip_v6ext_same", "ip_v6ext_diff", + ] + + for mode in ["sw", "hw", "lro"]: + for protocol in ["ipv4", "ipv6", "ipip"]: + for test_name in common_tests: + yield mode, protocol, test_name + + if protocol in ["ipv4", "ipip"]: + for test_name in ipv4_tests: + yield mode, protocol, test_name + elif protocol == "ipv6": + for test_name in ipv6_tests: + yield mode, protocol, test_name @ksft_variants(_gro_variants()) -def test(cfg, protocol, test_name): +def test(cfg, mode, protocol, test_name): """Run a single GRO test with retries.""" ipver = "6" if protocol[-1] == "6" else "4" cfg.require_ipver(ipver) - _setup(cfg, test_name) + _setup(cfg, mode, test_name) base_cmd_args = [ f"--{protocol}", @@ -142,10 +261,9 @@ def test(cfg, protocol, test_name): if rx_proc.ret == 0: return - ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# ')) - ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# ')) + ksft_pr(rx_proc) - if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"): + if test_name.startswith("large_") and os.environ.get("KSFT_MACHINE_SLOW"): ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") return diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 9c163ba6feee..a64140333a46 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -35,6 +35,7 @@ TEST_PROGS = \ pp_alloc_fail.py \ rss_api.py \ rss_ctx.py \ + rss_drv.py \ rss_flow_label.py \ rss_input_xfrm.py \ toeplitz.py \ diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index 62456df947bc..240d13dbc54e 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -12,6 +12,7 @@ #include <unistd.h> #include <arpa/inet.h> +#include <linux/mman.h> #include <linux/errqueue.h> #include <linux/if_packet.h> #include <linux/ipv6.h> @@ -37,6 +38,23 @@ #include <liburing.h> +#define SKIP_CODE 42 + +struct t_io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u32 zcrx_id; + __u32 rx_buf_len; + __u64 __resv[3]; +}; + static long page_size; #define AREA_SIZE (8192 * page_size) #define SEND_SIZE (512 * 4096) @@ -65,6 +83,8 @@ static bool cfg_oneshot; static int cfg_oneshot_recvs; static int cfg_send_size = SEND_SIZE; static struct sockaddr_in6 cfg_addr; +static unsigned int cfg_rx_buf_len; +static bool cfg_dry_run; static char *payload; static void *area_ptr; @@ -128,14 +148,28 @@ static void setup_zcrx(struct io_uring *ring) if (!ifindex) error(1, 0, "bad interface name: %s", cfg_ifname); - area_ptr = mmap(NULL, - AREA_SIZE, - PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, - 0, - 0); - if (area_ptr == MAP_FAILED) - error(1, 0, "mmap(): zero copy area"); + if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | + MAP_HUGETLB | MAP_HUGE_2MB, + -1, + 0); + if (area_ptr == MAP_FAILED) { + printf("Can't allocate huge pages\n"); + exit(SKIP_CODE); + } + } else { + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + } ring_size = get_refill_ring_size(rq_entries); ring_ptr = mmap(NULL, @@ -157,17 +191,23 @@ static void setup_zcrx(struct io_uring *ring) .flags = 0, }; - struct io_uring_zcrx_ifq_reg reg = { + struct t_io_uring_zcrx_ifq_reg reg = { .if_idx = ifindex, .if_rxq = cfg_queue_id, .rq_entries = rq_entries, .area_ptr = (__u64)(unsigned long)&area_reg, .region_ptr = (__u64)(unsigned long)®ion_reg, + .rx_buf_len = cfg_rx_buf_len, }; - ret = io_uring_register_ifq(ring, ®); - if (ret) + ret = io_uring_register_ifq(ring, (void *)®); + if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP || + ret == -ERANGE)) { + printf("Large chunks are not supported %i\n", ret); + exit(SKIP_CODE); + } else if (ret) { error(1, 0, "io_uring_register_ifq(): %d", ret); + } rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); @@ -323,6 +363,8 @@ static void run_server(void) io_uring_queue_init(512, &ring, flags); setup_zcrx(&ring); + if (cfg_dry_run) + return; add_accept(&ring, fd); @@ -383,7 +425,7 @@ static void parse_opts(int argc, char **argv) usage(argv[0]); cfg_payload_len = max_payload_len; - while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) { + while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) { switch (c) { case 's': if (cfg_client) @@ -418,6 +460,12 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_send_size = strtoul(optarg, NULL, 0); break; + case 'x': + cfg_rx_buf_len = page_size * strtoul(optarg, NULL, 0); + break; + case 'd': + cfg_dry_run = true; + break; } } diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 712c806508b5..c63d6d6450d2 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -3,104 +3,121 @@ import re from os import path -from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import ksft_run, ksft_exit, KsftSkipEx, ksft_variants, KsftNamedVariant from lib.py import NetDrvEpEnv from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen +from lib.py import EthtoolFamily +SKIP_CODE = 42 -def _get_current_settings(cfg): - output = ethtool(f"-g {cfg.ifname}", json=True)[0] - return (output['rx'], output['hds-thresh']) - - -def _get_combined_channels(cfg): - output = ethtool(f"-l {cfg.ifname}").stdout - values = re.findall(r'Combined:\s+(\d+)', output) - return int(values[1]) - - -def _create_rss_ctx(cfg, chan): - output = ethtool(f"-X {cfg.ifname} context new start {chan} equal 1").stdout +def create_rss_ctx(cfg): + output = ethtool(f"-X {cfg.ifname} context new start {cfg.target} equal 1").stdout values = re.search(r'New RSS context is (\d+)', output).group(1) - ctx_id = int(values) - return (ctx_id, defer(ethtool, f"-X {cfg.ifname} delete context {ctx_id}")) + return int(values) -def _set_flow_rule(cfg, port, chan): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {chan}").stdout +def set_flow_rule(cfg): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.target}").stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def _set_flow_rule_rss(cfg, port, ctx_id): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} context {ctx_id}").stdout +def set_flow_rule_rss(cfg, rss_ctx_id): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def test_zcrx(cfg) -> None: - cfg.require_ipver('6') - - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() - - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") +def single(cfg): + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + rx_rings = rings['rx'] + hds_thresh = rings.get('hds-thresh', 0) - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': hds_thresh, + 'rx': rx_rings}) - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") + cfg.target = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.target}") defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) + flow_rule_id = set_flow_rule(cfg) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" - with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") - cmd(tx_cmd, host=cfg.remote) +def rss(cfg): + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx('Test requires NETIF with at least 2 combined channels') -def test_zcrx_oneshot(cfg) -> None: - cfg.require_ipver('6') + rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + rx_rings = rings['rx'] + hds_thresh = rings.get('hds-thresh', 0) - combined_chans = _get_combined_channels(cfg) - if combined_chans < 2: - raise KsftSkipEx('at least 2 combined channels required') - (rx_ring, hds_thresh) = _get_current_settings(cfg) - port = rand_port() + cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': 64}) + defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown', + 'hds-thresh': hds_thresh, + 'rx': rx_rings}) - ethtool(f"-G {cfg.ifname} tcp-data-split on") - defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto") + cfg.target = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.target}") + defer(ethtool, f"-X {cfg.ifname} default") - ethtool(f"-G {cfg.ifname} hds-thresh 0") - defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}") + rss_ctx_id = create_rss_ctx(cfg) + defer(ethtool, f"-X {cfg.ifname} delete context {rss_ctx_id}") - ethtool(f"-G {cfg.ifname} rx 64") - defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}") + flow_rule_id = set_flow_rule_rss(cfg, rss_ctx_id) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") - defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) - defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") +@ksft_variants([ + KsftNamedVariant("single", single), + KsftNamedVariant("rss", rss), +]) +def test_zcrx(cfg, setup) -> None: + cfg.require_ipver('6') - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -o 4" - tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 4096 -z 16384" + setup(cfg) + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target}" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" with bkg(rx_cmd, exit_wait=True): - wait_port_listen(port, proto="tcp") + wait_port_listen(cfg.port, proto="tcp") cmd(tx_cmd, host=cfg.remote) -def test_zcrx_rss(cfg) -> None: +@ksft_variants([ + KsftNamedVariant("single", single), + KsftNamedVariant("rss", rss), +]) +def test_zcrx_oneshot(cfg, setup) -> None: + cfg.require_ipver('6') + + setup(cfg) + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -o 4" + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 4096 -z 16384" + with bkg(rx_cmd, exit_wait=True): + wait_port_listen(cfg.port, proto="tcp") + cmd(tx_cmd, host=cfg.remote) + + +def test_zcrx_large_chunks(cfg) -> None: + """Test zcrx with large buffer chunks.""" + cfg.require_ipver('6') combined_chans = _get_combined_channels(cfg) @@ -121,12 +138,16 @@ def test_zcrx_rss(cfg) -> None: ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}") defer(ethtool, f"-X {cfg.ifname} default") - (ctx_id, delete_ctx) = _create_rss_ctx(cfg, combined_chans - 1) - flow_rule_id = _set_flow_rule_rss(cfg, port, ctx_id) + flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" + rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -x 2" tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840" + + probe = cmd(rx_cmd + " -d", fail=False) + if probe.ret == SKIP_CODE: + raise KsftSkipEx(probe.stdout) + with bkg(rx_cmd, exit_wait=True): wait_port_listen(port, proto="tcp") cmd(tx_cmd, host=cfg.remote) @@ -137,7 +158,9 @@ def main() -> None: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + cfg.ethnl = EthtoolFamily() + cfg.port = rand_port() + ksft_run(globs=globals(), cases=[test_zcrx, test_zcrx_oneshot], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 3288ed04ce08..16864c844108 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -48,6 +48,7 @@ #include <errno.h> #define __iovec_defined #include <fcntl.h> +#include <limits.h> #include <malloc.h> #include <error.h> #include <poll.h> diff --git a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py index c1e943d53f19..c632b41e7a23 100755 --- a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py +++ b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py @@ -1,15 +1,38 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +# pylint: disable=locally-disabled, invalid-name, attribute-defined-outside-init, too-few-public-methods """ Tests related to configuration of HW timestamping """ import errno +import ctypes +import fcntl +import socket from lib.py import ksft_run, ksft_exit, ksft_ge, ksft_eq, KsftSkipEx from lib.py import NetDrvEnv, EthtoolFamily, NlError +SIOCSHWTSTAMP = 0x89b0 +SIOCGHWTSTAMP = 0x89b1 +class hwtstamp_config(ctypes.Structure): + """ Python copy of struct hwtstamp_config """ + _fields_ = [ + ("flags", ctypes.c_int), + ("tx_type", ctypes.c_int), + ("rx_filter", ctypes.c_int), + ] + + +class ifreq(ctypes.Structure): + """ Python copy of struct ifreq """ + _fields_ = [ + ("ifr_name", ctypes.c_char * 16), + ("ifr_data", ctypes.POINTER(hwtstamp_config)), + ] + + def __get_hwtimestamp_support(cfg): """ Retrieve supported configuration information """ @@ -31,8 +54,29 @@ def __get_hwtimestamp_support(cfg): return ctx +def __get_hwtimestamp_config_ioctl(cfg): + """ Retrieve current TS configuration information (via ioctl) """ + + config = hwtstamp_config() + + req = ifreq() + req.ifr_name = cfg.ifname.encode() + req.ifr_data = ctypes.pointer(config) + + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + fcntl.ioctl(sock.fileno(), SIOCGHWTSTAMP, req) + sock.close() + + except OSError as e: + if e.errno == errno.EOPNOTSUPP: + raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e + raise + return config + + def __get_hwtimestamp_config(cfg): - """ Retrieve current TS configuration information """ + """ Retrieve current TS configuration information (via netLink) """ try: tscfg = cfg.ethnl.tsconfig_get({'header': {'dev-name': cfg.ifname}}) @@ -43,8 +87,27 @@ def __get_hwtimestamp_config(cfg): return tscfg +def __set_hwtimestamp_config_ioctl(cfg, ts): + """ Setup new TS configuration information (via ioctl) """ + config = hwtstamp_config() + config.rx_filter = ts['rx-filters']['bits']['bit'][0]['index'] + config.tx_type = ts['tx-types']['bits']['bit'][0]['index'] + req = ifreq() + req.ifr_name = cfg.ifname.encode() + req.ifr_data = ctypes.pointer(config) + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + fcntl.ioctl(sock.fileno(), SIOCSHWTSTAMP, req) + sock.close() + + except OSError as e: + if e.errno == errno.EOPNOTSUPP: + raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e + raise + + def __set_hwtimestamp_config(cfg, ts): - """ Setup new TS configuration information """ + """ Setup new TS configuration information (via netlink) """ ts['header'] = {'dev-name': cfg.ifname} try: @@ -56,9 +119,9 @@ def __set_hwtimestamp_config(cfg, ts): return res -def test_hwtstamp_tx(cfg): +def __perform_hwtstamp_tx(cfg, is_ioctl): """ - Test TX timestamp configuration. + Test TX timestamp configuration via either netlink or ioctl. The driver should apply provided config and report back proper state. """ @@ -66,16 +129,37 @@ def test_hwtstamp_tx(cfg): ts = __get_hwtimestamp_support(cfg) tx = ts['tx'] for t in tx: + res = None tscfg = orig_tscfg tscfg['tx-types']['bits']['bit'] = [t] - res = __set_hwtimestamp_config(cfg, tscfg) + if is_ioctl: + __set_hwtimestamp_config_ioctl(cfg, tscfg) + else: + res = __set_hwtimestamp_config(cfg, tscfg) if res is None: res = __get_hwtimestamp_config(cfg) + resioctl = __get_hwtimestamp_config_ioctl(cfg) ksft_eq(res['tx-types']['bits']['bit'], [t]) + ksft_eq(resioctl.tx_type, t['index']) __set_hwtimestamp_config(cfg, orig_tscfg) +def test_hwtstamp_tx_netlink(cfg): + """ + Test TX timestamp configuration setup via netlink. + The driver should apply provided config and report back proper state. + """ + __perform_hwtstamp_tx(cfg, False) + + +def test_hwtstamp_tx_ioctl(cfg): + """ + Test TX timestamp configuration setup via ioctl. + The driver should apply provided config and report back proper state. + """ + __perform_hwtstamp_tx(cfg, True) + -def test_hwtstamp_rx(cfg): +def __perform_hwtstamp_rx(cfg, is_ioctl): """ Test RX timestamp configuration. The filter configuration is taken from the list of supported filters. @@ -87,11 +171,17 @@ def test_hwtstamp_rx(cfg): ts = __get_hwtimestamp_support(cfg) rx = ts['rx'] for r in rx: + res = None tscfg = orig_tscfg tscfg['rx-filters']['bits']['bit'] = [r] - res = __set_hwtimestamp_config(cfg, tscfg) + if is_ioctl: + __set_hwtimestamp_config_ioctl(cfg, tscfg) + else: + res = __set_hwtimestamp_config(cfg, tscfg) if res is None: res = __get_hwtimestamp_config(cfg) + resioctl = __get_hwtimestamp_config_ioctl(cfg) + ksft_eq(resioctl.rx_filter, res['rx-filters']['bits']['bit'][0]['index']) if r['index'] == 0 or r['index'] == 1: ksft_eq(res['rx-filters']['bits']['bit'][0]['index'], r['index']) else: @@ -100,12 +190,34 @@ def test_hwtstamp_rx(cfg): __set_hwtimestamp_config(cfg, orig_tscfg) +def test_hwtstamp_rx_netlink(cfg): + """ + Test RX timestamp configuration via netlink. + The filter configuration is taken from the list of supported filters. + The driver should apply the config without error and report back proper state. + Some extension of the timestamping scope is allowed for PTP filters. + """ + __perform_hwtstamp_rx(cfg, False) + + +def test_hwtstamp_rx_ioctl(cfg): + """ + Test RX timestamp configuration via ioctl. + The filter configuration is taken from the list of supported filters. + The driver should apply the config without error and report back proper state. + Some extension of the timestamping scope is allowed for PTP filters. + """ + __perform_hwtstamp_rx(cfg, True) + + def main() -> None: """ Ksft boiler plate main """ with NetDrvEnv(__file__, nsim_test=False) as cfg: cfg.ethnl = EthtoolFamily() - ksft_run([test_hwtstamp_tx, test_hwtstamp_rx], args=(cfg,)) + ksft_run([test_hwtstamp_tx_ioctl, test_hwtstamp_tx_netlink, + test_hwtstamp_rx_ioctl, test_hwtstamp_rx_netlink], + args=(cfg,)) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/rss_drv.py b/tools/testing/selftests/drivers/net/hw/rss_drv.py new file mode 100755 index 000000000000..2d1a33189076 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/rss_drv.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Driver-related behavior tests for RSS. +""" + +from lib.py import ksft_run, ksft_exit, ksft_ge +from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx +from lib.py import defer, ethtool +from lib.py import EthtoolFamily, NlError +from lib.py import NetDrvEnv + + +def _is_power_of_two(n): + return n > 0 and (n & (n - 1)) == 0 + + +def _get_rss(cfg, context=0): + return ethtool(f"-x {cfg.ifname} context {context}", json=True)[0] + + +def _test_rss_indir_size(cfg, qcnt, context=0): + """Test that indirection table size is at least 4x queue count.""" + ethtool(f"-L {cfg.ifname} combined {qcnt}") + + rss = _get_rss(cfg, context=context) + indir = rss['rss-indirection-table'] + ksft_ge(len(indir), 4 * qcnt, "Table smaller than 4x") + return len(indir) + + +def _maybe_create_context(cfg, create_context): + """ Either create a context and return its ID or return 0 for main ctx """ + if not create_context: + return 0 + try: + ctx = cfg.ethnl.rss_create_act({'header': {'dev-index': cfg.ifindex}}) + ctx_id = ctx['context'] + defer(cfg.ethnl.rss_delete_act, + {'header': {'dev-index': cfg.ifindex}, 'context': ctx_id}) + except NlError: + raise KsftSkipEx("Device does not support additional RSS contexts") + + return ctx_id + + +@ksft_variants([ + KsftNamedVariant("main", False), + KsftNamedVariant("ctx", True), +]) +def indir_size_4x(cfg, create_context): + """ + Test that the indirection table has at least 4 entries per queue. + Empirically network-heavy workloads like memcache suffer with the 33% + imbalance of a 2x indirection table size. + 4x table translates to a 16% imbalance. + """ + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + ch_max = channels.get('combined-max', 0) + qcnt = channels['combined-count'] + + if ch_max < 3: + raise KsftSkipEx(f"Not enough queues for the test: max={ch_max}") + + defer(ethtool, f"-L {cfg.ifname} combined {qcnt}") + ethtool(f"-L {cfg.ifname} combined 3") + + ctx_id = _maybe_create_context(cfg, create_context) + + indir_sz = _test_rss_indir_size(cfg, 3, context=ctx_id) + + # Test with max queue count (max - 1 if max is a power of two) + test_max = ch_max - 1 if _is_power_of_two(ch_max) else ch_max + if test_max > 3 and indir_sz < test_max * 4: + _test_rss_indir_size(cfg, test_max, context=ctx_id) + + +def main() -> None: + """ Ksft boiler plate main """ + with NetDrvEnv(__file__) as cfg: + cfg.ethnl = EthtoolFamily() + ksft_run([indir_size_4x], args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py index 6fa95fe27c47..7dc80070884a 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py +++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py @@ -145,9 +145,14 @@ def test_rss_flow_label_6only(cfg): # Try to enable Flow Labels and check again, in case it leaks thru initial = _ethtool_get_cfg(cfg, "udp6") - changed = initial.replace("l", "") if "l" in initial else initial + "l" - - cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}") + no_lbl = initial.replace("l", "") + if "l" not in initial: + try: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}") + except CmdExitFailure as exc: + raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc + else: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}") restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") _check_v4_flow_types(cfg) diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index 72880e388478..503f1a2a2872 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -5,9 +5,9 @@ import multiprocessing import socket from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, cmd, fd_read_timeout from lib.py import NetDrvEpEnv -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import KsftSkipEx, KsftFailEx -from lib.py import rand_port +from lib.py import defer, ksft_pr, rand_port def traffic(cfg, local_port, remote_port, ipver): @@ -21,6 +21,40 @@ def traffic(cfg, local_port, remote_port, ipver): return sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) +def _rss_input_xfrm_try_enable(cfg): + """ + Check if symmetric input-xfrm is already enabled, if not try to enable it + and register a cleanup. + """ + rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}) + orig_xfrm = rss.get('input-xfrm', set()) + sym_xfrm = set(filter(lambda x: 'sym' in x, orig_xfrm)) + + if sym_xfrm: + ksft_pr("Sym input xfrm already enabled:", sym_xfrm) + return sym_xfrm + + for xfrm in cfg.ethnl.consts["input-xfrm"].entries: + # Skip non-symmetric transforms + if "sym" not in xfrm: + continue + + try_xfrm = {xfrm} | orig_xfrm + try: + cfg.ethnl.rss_set({"header": {"dev-index": cfg.ifindex}, + "input-xfrm": try_xfrm}) + except NlError: + continue + + ksft_pr("Sym input xfrm configured:", try_xfrm) + defer(cfg.ethnl.rss_set, + {"header": {"dev-index": cfg.ifindex}, + "input-xfrm": orig_xfrm}) + return {xfrm} + + return set() + + def test_rss_input_xfrm(cfg, ipver): """ Test symmetric input_xfrm. @@ -37,12 +71,10 @@ def test_rss_input_xfrm(cfg, ipver): if not hasattr(socket, "SO_INCOMING_CPU"): raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") - rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}}) - input_xfrm = set(filter(lambda x: 'sym' in x, rss.get('input-xfrm', {}))) - # Check for symmetric xor/or-xor + input_xfrm = _rss_input_xfrm_try_enable(cfg) if not input_xfrm: - raise KsftSkipEx("Symmetric RSS hash not requested") + raise KsftSkipEx("Symmetric RSS hash not supported by device") cpus = set() successful = 0 diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c index 285bb17df9c2..035bf908d8d9 100644 --- a/tools/testing/selftests/drivers/net/hw/toeplitz.c +++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c @@ -59,7 +59,7 @@ #include "../../../net/lib/ksft.h" #define TOEPLITZ_KEY_MIN_LEN 40 -#define TOEPLITZ_KEY_MAX_LEN 60 +#define TOEPLITZ_KEY_MAX_LEN 256 #define TOEPLITZ_STR_LEN(K) (((K) * 3) - 1) /* hex encoded: AA:BB:CC:...:ZZ */ #define TOEPLITZ_STR_MIN_LEN TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN) @@ -72,6 +72,8 @@ #define RPS_MAX_CPUS 16UL /* must be a power of 2 */ +#define MIN_PKT_SAMPLES 40 /* minimum number of packets to receive */ + /* configuration options (cmdline arguments) */ static uint16_t cfg_dport = 8000; static int cfg_family = AF_INET6; @@ -251,15 +253,31 @@ static bool recv_block(struct ring_state *ring) return true; } -/* simple test: sleep once unconditionally and then process all rings */ +/* simple test: process all rings until MIN_PKT_SAMPLES packets are received, + * or the test times out. + */ static void process_rings(void) { + struct timeval start, now; + bool pkts_found = true; + long elapsed_usec; int i; - usleep(1000 * cfg_timeout_msec); + gettimeofday(&start, NULL); - for (i = 0; i < num_cpus; i++) - do {} while (recv_block(&rings[i])); + do { + if (!pkts_found) + usleep(100); + + pkts_found = false; + for (i = 0; i < num_cpus; i++) + pkts_found |= recv_block(&rings[i]); + + gettimeofday(&now, NULL); + elapsed_usec = (now.tv_sec - start.tv_sec) * 1000000 + + (now.tv_usec - start.tv_usec); + } while (frames_received - frames_nohash < MIN_PKT_SAMPLES && + elapsed_usec < cfg_timeout_msec * 1000); fprintf(stderr, "count: pass=%u nohash=%u fail=%u\n", frames_received - frames_nohash - frames_error, diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 8b644fd84ff2..41cc248ac848 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -170,6 +170,7 @@ class NetDrvEpEnv(NetDrvEnvBase): self.remote_ifname = self.resolve_remote_ifc() self.remote_dev = ip("-d link show dev " + self.remote_ifname, host=self.remote, json=True)[0] + self.remote_ifindex = self.remote_dev['ifindex'] self._required_cmd = {} @@ -247,9 +248,12 @@ class NetDrvEpEnv(NetDrvEnvBase): if not self.addr_v[ipver] or not self.remote_addr_v[ipver]: raise KsftSkipEx(f"Test requires IPv{ipver} connectivity") - def require_nsim(self): - if self._ns is None: + def require_nsim(self, nsim_test=True): + """Require or exclude netdevsim for this test""" + if nsim_test and self._ns is None: raise KsftXfailEx("Test only works on netdevsim") + if nsim_test is False and self._ns is not None: + raise KsftXfailEx("Test does not work on netdevsim") def _require_cmd(self, comm, key, host=None): cached = self._required_cmd.get(comm, {}) diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index ae8abff4be40..b6093bcf2b06 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -203,19 +203,21 @@ function do_cleanup() { function cleanup_netcons() { # delete netconsole dynamic reconfiguration # do not fail if the target is already disabled - if [[ ! -d "${NETCONS_PATH}" ]] + local TARGET_PATH=${1:-${NETCONS_PATH}} + + if [[ ! -d "${TARGET_PATH}" ]] then # in some cases this is called before netcons path is created return fi - if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + if [[ $(cat "${TARGET_PATH}"/enabled) != 0 ]] then - echo 0 > "${NETCONS_PATH}"/enabled || true + echo 0 > "${TARGET_PATH}"/enabled || true fi # Remove all the keys that got created during the selftest - find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete + find "${TARGET_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry - rmdir "${NETCONS_PATH}" + rmdir "${TARGET_PATH}" } function cleanup() { @@ -377,6 +379,29 @@ function check_netconsole_module() { fi } +function wait_target_state() { + local TARGET=${1} + local STATE=${2} + local TARGET_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" + local ENABLED=0 + + if [ "${STATE}" == "enabled" ] + then + ENABLED=1 + fi + + if [ ! -d "$TARGET_PATH" ]; then + echo "FAIL: Target does not exist." >&2 + exit "${ksft_fail}" + fi + + local CHECK_CMD="grep \"$ENABLED\" \"$TARGET_PATH/enabled\"" + slowwait 2 sh -c "test -n \"\$($CHECK_CMD)\"" || { + echo "FAIL: ${TARGET} is not ${STATE}." >&2 + exit "${ksft_fail}" + } +} + # A wrapper to translate protocol version to udp version function wait_for_port() { local NAMESPACE=${1} diff --git a/tools/testing/selftests/drivers/net/netconsole/Makefile b/tools/testing/selftests/drivers/net/netconsole/Makefile new file mode 100644 index 000000000000..b56c70b7e274 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_INCLUDES := \ + ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ +# end of TEST_INCLUDES + +TEST_PROGS := \ + netcons_basic.sh \ + netcons_cmdline.sh \ + netcons_fragmented_msg.sh \ + netcons_overflow.sh \ + netcons_resume.sh \ + netcons_sysdata.sh \ + netcons_torture.sh \ +# end of TEST_PROGS + +include ../../../lib.mk + diff --git a/tools/testing/selftests/drivers/net/netconsole/config b/tools/testing/selftests/drivers/net/netconsole/config new file mode 100644 index 000000000000..a3f6b0fd44ef --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/config @@ -0,0 +1,6 @@ +CONFIG_CONFIGFS_FS=y +CONFIG_IPV6=y +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y +CONFIG_NETDEVSIM=m diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh index 2022f3061738..59cf10013ecd 100755 --- a/tools/testing/selftests/drivers/net/netcons_basic.sh +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh @@ -18,7 +18,7 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh modprobe netdevsim 2> /dev/null || true modprobe netconsole 2> /dev/null || true diff --git a/tools/testing/selftests/drivers/net/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh index d1d23dc67f99..96d704b8d9d9 100755 --- a/tools/testing/selftests/drivers/net/netcons_cmdline.sh +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh @@ -12,7 +12,7 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh check_netconsole_module diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh index 4a71e01a230c..0dc7280c3080 100755 --- a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh @@ -16,7 +16,7 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh modprobe netdevsim 2> /dev/null || true modprobe netconsole 2> /dev/null || true diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh index 06089643b771..a8e43d08c166 100755 --- a/tools/testing/selftests/drivers/net/netcons_overflow.sh +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh @@ -13,7 +13,7 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh # This is coming from netconsole code. Check for it in drivers/net/netconsole.c MAX_USERDATA_ITEMS=256 diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh new file mode 100755 index 000000000000..cb59cf436dd0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# This test validates that netconsole is able to resume a target that was +# deactivated when its interface was removed when the interface is brought +# back up. +# +# The test configures a netconsole target and then removes netdevsim module to +# cause the interface to disappear. Targets are configured via cmdline to ensure +# targets bound by interface name and mac address can be resumed. +# The test verifies that the target moved to disabled state before adding +# netdevsim and the interface back. +# +# Finally, the test verifies that the target is re-enabled automatically and +# the message is received on the destination interface. +# +# Author: Andre Carvalho <asantostc@gmail.com> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +SAVED_SRCMAC="" # to be populated later +SAVED_DSTMAC="" # to be populated later + +modprobe netdevsim 2> /dev/null || true +rmmod netconsole 2> /dev/null || true + +check_netconsole_module + +function cleanup() { + cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0" + do_cleanup + rmmod netconsole +} + +function trigger_reactivation() { + # Add back low level module + modprobe netdevsim + # Recreate namespace and two interfaces + set_network + # Restore MACs + ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \ + address "${SAVED_DSTMAC}" + if [ "${BINDMODE}" == "mac" ]; then + ip link set dev "${SRCIF}" down + ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}" + # Rename device in order to trigger target resume, as initial + # when device was recreated it didn't have correct mac address. + ip link set dev "${SRCIF}" name "${TARGET}" + fi +} + +function trigger_deactivation() { + # Start by storing mac addresses so we can be restored in reactivate + SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \ + cat /sys/class/net/"$DSTIF"/address) + SAVED_SRCMAC=$(mac_get "${SRCIF}") + # Remove low level module + rmmod netdevsim +} + +trap cleanup EXIT + +# Run the test twice, with different cmdline parameters +for BINDMODE in "ifname" "mac" +do + echo "Running with bind mode: ${BINDMODE}" >&2 + # Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) + echo "6 5" > /proc/sys/kernel/printk + + # Create one namespace and two interfaces + set_network + + # Create the command line for netconsole, with the configuration from + # the function above + CMDLINE=$(create_cmdline_str "${BINDMODE}") + + # The content of kmsg will be save to the following file + OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}" + + # Load the module, with the cmdline set + modprobe netconsole "${CMDLINE}" + # Expose cmdline target in configfs + mkdir "${NETCONS_CONFIGFS}/cmdline0" + + # Target should be enabled + wait_target_state "cmdline0" "enabled" + + # Trigger deactivation by unloading netdevsim module. Target should be + # disabled. + trigger_deactivation + wait_target_state "cmdline0" "disabled" + + # Trigger reactivation by loading netdevsim, recreating the network and + # restoring mac addresses. Target should be re-enabled. + trigger_reactivation + wait_target_state "cmdline0" "enabled" + + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_msg "${OUTPUT_FILE}" + + # kill socat in case it is still running + pkill_socat + # Cleanup & unload the module + cleanup + + echo "${BINDMODE} : Test passed" >&2 +done + +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh index baf69031089e..3fb8c4afe3d2 100755 --- a/tools/testing/selftests/drivers/net/netcons_sysdata.sh +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh @@ -18,7 +18,7 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh # Enable the sysdata cpu_nr feature function set_cpu_nr() { diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh index 2ce9ee3719d1..33a44adb6f8f 100755 --- a/tools/testing/selftests/drivers/net/netcons_torture.sh +++ b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh @@ -17,7 +17,7 @@ set -euo pipefail SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") -source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh # Number of times the main loop run ITERATIONS=${1:-150} diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 52523bdad240..864d9fce1094 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -266,6 +266,7 @@ def assoc_sk_only_mismatch(cfg): the_exception = cm.exception ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id") ksft_eq(the_exception.nl_msg.error, -errno.EINVAL) + _close_conn(cfg, s) def assoc_sk_only_mismatch_tx(cfg): @@ -283,6 +284,7 @@ def assoc_sk_only_mismatch_tx(cfg): the_exception = cm.exception ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id") ksft_eq(the_exception.nl_msg.error, -errno.EINVAL) + _close_conn(cfg, s) def assoc_sk_only_unconn(cfg): @@ -601,8 +603,8 @@ def main() -> None: cfg.comm_port = rand_port() srv = None try: - with bkg(responder + f" -p {cfg.comm_port}", host=cfg.remote, - exit_wait=True) as srv: + with bkg(responder + f" -p {cfg.comm_port} -i {cfg.remote_ifindex}", + host=cfg.remote, exit_wait=True) as srv: wait_port_listen(cfg.comm_port, host=cfg.remote) cfg.comm_sock = socket.create_connection((cfg.remote_addr, diff --git a/tools/testing/selftests/drivers/net/psp_responder.c b/tools/testing/selftests/drivers/net/psp_responder.c index f309e0d73cbf..a26e7628bbb1 100644 --- a/tools/testing/selftests/drivers/net/psp_responder.c +++ b/tools/testing/selftests/drivers/net/psp_responder.c @@ -22,7 +22,7 @@ static bool should_quit; struct opts { int port; - int devid; + int ifindex; bool verbose; }; @@ -360,7 +360,7 @@ static void usage(const char *name, const char *miss) if (miss) fprintf(stderr, "Missing argument: %s\n", miss); - fprintf(stderr, "Usage: %s -p port [-v] [-d psp-dev-id]\n", name); + fprintf(stderr, "Usage: %s -p port [-v] [-i ifindex]\n", name); exit(EXIT_FAILURE); } @@ -368,7 +368,7 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts) { int opt; - while ((opt = getopt(argc, argv, "vp:d:")) != -1) { + while ((opt = getopt(argc, argv, "vp:i:")) != -1) { switch (opt) { case 'v': opts->verbose = 1; @@ -376,8 +376,8 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts) case 'p': opts->port = atoi(optarg); break; - case 'd': - opts->devid = atoi(optarg); + case 'i': + opts->ifindex = atoi(optarg); break; default: usage(argv[0], NULL); @@ -410,12 +410,11 @@ static int psp_dev_set_ena(struct ynl_sock *ys, __u32 dev_id, __u32 versions) int main(int argc, char **argv) { struct psp_dev_get_list *dev_list; - bool devid_found = false; __u32 ver_ena, ver_cap; struct opts opts = {}; struct ynl_error yerr; struct ynl_sock *ys; - int first_id = 0; + int devid = -1; int ret; parse_cmd_opts(argc, argv, &opts); @@ -429,20 +428,19 @@ int main(int argc, char **argv) } dev_list = psp_dev_get_dump(ys); - if (ynl_dump_empty(dev_list)) { - if (ys->err.code) - goto err_close; - fprintf(stderr, "No PSP devices\n"); - goto err_close_silent; - } + if (ynl_dump_empty(dev_list) && ys->err.code) + goto err_close; ynl_dump_foreach(dev_list, d) { - if (opts.devid) { - devid_found = true; + if (opts.ifindex) { + if (d->ifindex != opts.ifindex) + continue; + devid = d->id; ver_ena = d->psp_versions_ena; ver_cap = d->psp_versions_cap; - } else if (!first_id) { - first_id = d->id; + break; + } else if (devid < 0) { + devid = d->id; ver_ena = d->psp_versions_ena; ver_cap = d->psp_versions_cap; } else { @@ -452,23 +450,21 @@ int main(int argc, char **argv) } psp_dev_get_list_free(dev_list); - if (opts.devid && !devid_found) { - fprintf(stderr, "PSP device %d requested on cmdline, not found\n", - opts.devid); - goto err_close_silent; - } else if (!opts.devid) { - opts.devid = first_id; - } + if (opts.ifindex && devid < 0) + fprintf(stderr, + "WARN: PSP device with ifindex %d requested on cmdline, not found\n", + opts.ifindex); - if (ver_ena != ver_cap) { - ret = psp_dev_set_ena(ys, opts.devid, ver_cap); + if (devid >= 0 && ver_ena != ver_cap) { + ret = psp_dev_set_ena(ys, devid, ver_cap); if (ret) goto err_close; } ret = run_responder(ys, &opts); - if (ver_ena != ver_cap && psp_dev_set_ena(ys, opts.devid, ver_ena)) + if (devid >= 0 && ver_ena != ver_cap && + psp_dev_set_ena(ys, devid, ver_ena)) fprintf(stderr, "WARN: failed to set the PSP versions back\n"); ynl_sock_destroy(ys); diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c index 94c6c81c2301..2c4c50500116 100644 --- a/tools/testing/selftests/filesystems/anon_inode_test.c +++ b/tools/testing/selftests/filesystems/anon_inode_test.c @@ -42,7 +42,10 @@ TEST(anon_inode_no_exec) fd_context = sys_fsopen("tmpfs", 0); ASSERT_GE(fd_context, 0); - ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0); + char *const empty_argv[] = {NULL}; + char *const empty_envp[] = {NULL}; + + ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0); ASSERT_EQ(errno, EACCES); EXPECT_EQ(close(fd_context), 0); diff --git a/tools/testing/selftests/filesystems/open_tree_ns/.gitignore b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore new file mode 100644 index 000000000000..fb12b93fbcaa --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore @@ -0,0 +1 @@ +open_tree_ns_test diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile new file mode 100644 index 000000000000..73c03c4a7ef6 --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := open_tree_ns_test + +CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES) +LDLIBS := -lcap + +include ../../lib.mk + +$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c new file mode 100644 index 000000000000..9711556280ae --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c @@ -0,0 +1,1030 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for OPEN_TREE_NAMESPACE flag. + * + * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount + * namespace containing the specified mount tree. + */ +#define _GNU_SOURCE + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/nsfs.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../wrappers.h" +#include "../statmount/statmount.h" +#include "../utils.h" +#include "../../kselftest_harness.h" + +#ifndef OPEN_TREE_NAMESPACE +#define OPEN_TREE_NAMESPACE (1 << 1) +#endif + +static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id) +{ + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) + return -errno; + return 0; +} + +static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + ret = get_mnt_ns_id(fd, mnt_ns_id); + close(fd); + return ret; +} + +#define STATMOUNT_BUFSIZE (1 << 15) + +static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask) +{ + struct statmount *buf; + size_t bufsize = STATMOUNT_BUFSIZE; + int ret; + + for (;;) { + buf = malloc(bufsize); + if (!buf) + return NULL; + + ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0); + if (ret == 0) + return buf; + + free(buf); + if (errno != EOVERFLOW) + return NULL; + + bufsize <<= 1; + } +} + +static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) +{ + const char *fs_type = ""; + const char *mnt_root = ""; + const char *mnt_point = ""; + + if (sm->mask & STATMOUNT_FS_TYPE) + fs_type = sm->str + sm->fs_type; + if (sm->mask & STATMOUNT_MNT_ROOT) + mnt_root = sm->str + sm->mnt_root; + if (sm->mask & STATMOUNT_MNT_POINT) + mnt_point = sm->str + sm->mnt_point; + + TH_LOG(" mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s", + (unsigned long long)sm->mnt_id, + (unsigned long long)sm->mnt_parent_id, + fs_type, mnt_root, mnt_point); +} + +static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) +{ + uint64_t list[256]; + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) { + TH_LOG("listmount failed: %s", strerror(errno)); + return; + } + + TH_LOG("Mount namespace %llu contains %zd mount(s):", + (unsigned long long)mnt_ns_id, nr_mounts); + + for (ssize_t i = 0; i < nr_mounts; i++) { + struct statmount *sm; + + sm = statmount_alloc(list[i], mnt_ns_id, + STATMOUNT_MNT_BASIC | + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT); + if (!sm) { + TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", + i, (unsigned long long)list[i], strerror(errno)); + continue; + } + + log_mount(_metadata, sm); + free(sm); + } +} + +FIXTURE(open_tree_ns) +{ + int fd; + uint64_t current_ns_id; +}; + +FIXTURE_VARIANT(open_tree_ns) +{ + const char *path; + unsigned int flags; + bool expect_success; + bool expect_different_ns; + int min_mounts; +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, basic_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + /* + * The empty rootfs is hidden from listmount()/mountinfo, + * so we only see the bind mount on top of it. + */ + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_proc) +{ + .path = "/proc", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_run) +{ + .path = "/run", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, invalid_recursive_alone) +{ + .path = "/", + .flags = AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = false, + .expect_different_ns = false, + .min_mounts = 0, +}; + +FIXTURE_SETUP(open_tree_ns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); + + /* Get current mount namespace ID for comparison */ + ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id); + if (ret < 0) + SKIP(return, "Failed to get current mount namespace ID"); +} + +FIXTURE_TEARDOWN(open_tree_ns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns, create_namespace) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + + if (!variant->expect_success) { + ASSERT_LT(self->fd, 0); + ASSERT_EQ(errno, EINVAL); + return; + } + + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Verify we can get the namespace ID */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + /* Verify it's a different namespace */ + if (variant->expect_different_ns) + ASSERT_NE(new_ns_id, self->current_ns_id); + + /* List mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("%m - listmount failed"); + } + + /* Verify minimum expected mounts */ + ASSERT_GE(nr_mounts, variant->min_mounts); + TH_LOG("Namespace contains %zd mounts", nr_mounts); +} + +TEST_F(open_tree_ns, setns_into_namespace) +{ + uint64_t new_ns_id; + pid_t pid; + int status; + int ret; + + /* Only test with basic flags */ + if (!(variant->flags & OPEN_TREE_NAMESPACE)) + SKIP(return, "setns test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Get namespace ID and dump all mounts */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + dump_mounts(_metadata, new_ns_id); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: try to enter the namespace */ + if (setns(self->fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(open_tree_ns, verify_mount_properties) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + /* Only test with basic flags on root */ + if (variant->flags != (OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC) || + strcmp(variant->path, "/") != 0) + SKIP(return, "mount properties test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, "/", OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + /* Get info about the root mount (the bind mount, rootfs is hidden) */ + ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + ASSERT_NE(sm.mnt_id, sm.mnt_parent_id); + + TH_LOG("Root mount id: %llu, parent: %llu", + (unsigned long long)sm.mnt_id, + (unsigned long long)sm.mnt_parent_id); +} + +FIXTURE(open_tree_ns_caps) +{ + bool has_caps; +}; + +FIXTURE_SETUP(open_tree_ns_caps) +{ + int ret; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + self->has_caps = (geteuid() == 0); +} + +FIXTURE_TEARDOWN(open_tree_ns_caps) +{ +} + +TEST_F(open_tree_ns_caps, requires_cap_sys_admin) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + + /* Child: drop privileges using utils.h helper */ + if (enter_userns() != 0) + _exit(2); + + /* Drop all caps using utils.h helper */ + if (caps_down() == 0) + _exit(3); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd >= 0) { + close(fd); + /* Should have failed without caps */ + _exit(1); + } + + if (errno == EPERM) + _exit(0); + + /* EINVAL means OPEN_TREE_NAMESPACE not supported */ + if (errno == EINVAL) + _exit(4); + + /* Unexpected error */ + _exit(5); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Expected: EPERM without caps */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("OPEN_TREE_NAMESPACE succeeded without caps"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 3: + SKIP(return, "caps_down failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_userns) +{ + int fd; +}; + +FIXTURE_SETUP(open_tree_ns_userns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); +} + +FIXTURE_TEARDOWN(open_tree_ns_userns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns_userns, create_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace (also creates mount namespace) */ + if (enter_userns() != 0) + _exit(2); + + /* Now we have CAP_SYS_ADMIN in the user namespace */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); /* OPEN_TREE_NAMESPACE not supported */ + _exit(1); + } + + /* Verify we can get the namespace ID */ + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Verify we can list mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Should have at least 1 mount */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, setns_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + int fd; + pid_t inner_pid; + int inner_status; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Fork again to test setns into the new namespace */ + inner_pid = fork(); + if (inner_pid < 0) + _exit(8); + + if (inner_pid == 0) { + /* Inner child: enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + if (waitpid(inner_pid, &inner_status, 0) != inner_pid) + _exit(9); + + if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0) + _exit(10); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree or setns failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("Inner fork failed"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("Inner waitpid failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, recursive_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + /* Test recursive flag in userns */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Recursive should copy submounts too */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE|AT_RECURSIVE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_fails_einval) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) == 0) { + free(sm); + _exit(7); + } + + if (errno != EINVAL) { + /* Wrong error */ + free(sm); + _exit(8); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_succeeds) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + if (unshare(CLONE_NEWNS)) + _exit(1); + + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0) + _exit(1); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) != 0) { + free(sm); + _exit(7); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_unbindable) +{ + char tmpdir[PATH_MAX]; + bool mounted; +}; + +FIXTURE_SETUP(open_tree_ns_unbindable) +{ + int ret; + + self->mounted = false; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Create a temporary directory for the test mount */ + snprintf(self->tmpdir, sizeof(self->tmpdir), + "/tmp/open_tree_ns_test.XXXXXX"); + ASSERT_NE(mkdtemp(self->tmpdir), NULL); + + /* Mount tmpfs there */ + ret = mount("tmpfs", self->tmpdir, "tmpfs", 0, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to mount tmpfs"); + } + self->mounted = true; + + ret = mount(NULL, self->tmpdir, NULL, MS_UNBINDABLE, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to make tmpfs unbindable"); + } +} + +FIXTURE_TEARDOWN(open_tree_ns_unbindable) +{ + if (self->mounted) + umount2(self->tmpdir, MNT_DETACH); + rmdir(self->tmpdir); +} + +TEST_F(open_tree_ns_unbindable, fails_on_unbindable) +{ + int fd; + + fd = sys_open_tree(AT_FDCWD, self->tmpdir, + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + ASSERT_LT(fd, 0); +} + +TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + bool found_unbindable = false; + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + ASSERT_GT(fd, 0); + + ASSERT_EQ(get_mnt_ns_id(fd, &new_ns_id), 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("listmount failed: %m"); + } + + /* + * Iterate through all mounts in the new namespace and verify + * the unbindable tmpfs mount was silently dropped. + */ + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT); + ASSERT_NE(sm, NULL) { + TH_LOG("statmount_alloc failed for mnt_id %llu", + (unsigned long long)list[i]); + } + + mnt_point = sm->str + sm->mnt_point; + + if (strcmp(mnt_point, self->tmpdir) == 0) { + TH_LOG("Found unbindable mount at %s (should have been dropped)", + mnt_point); + found_unbindable = true; + } + + free(sm); + } + + ASSERT_FALSE(found_unbindable) { + TH_LOG("Unbindable mount at %s was not dropped", self->tmpdir); + } + + close(fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index 99e5ad082fb1..e1cba4bfd8d9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -43,19 +43,24 @@ #endif #endif -static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, - struct statmount *buf, size_t bufsize, +static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint32_t fd, + uint64_t mask, struct statmount *buf, size_t bufsize, unsigned int flags) { struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, .param = mask, }; - if (mnt_ns_id) { + if (flags & STATMOUNT_BY_FD) { req.size = MNT_ID_REQ_SIZE_VER1; - req.mnt_ns_id = mnt_ns_id; + req.mnt_fd = fd; + } else { + req.mnt_id = mnt_id; + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } } return syscall(__NR_statmount, &req, buf, bufsize, flags); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 6e53430423d2..a04bcaace126 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -33,15 +33,24 @@ static const char *const known_fs[] = { "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) +static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags) { size_t bufsize = 1 << 15; - struct statmount *buf = NULL, *tmp = alloca(bufsize); + struct statmount *buf = NULL, *tmp = NULL; int tofree = 0; int ret; + if (flags & STATMOUNT_BY_FD && fd < 0) + return NULL; + + tmp = alloca(bufsize); + for (;;) { - ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); + if (flags & STATMOUNT_BY_FD) + ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags); + else + ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags); + if (ret != -1) break; if (tofree) @@ -237,7 +246,7 @@ static void test_statmount_zero_mask(void) struct statmount sm; int ret; - ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, 0, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount zero mask: %s\n", strerror(errno)); @@ -263,7 +272,7 @@ static void test_statmount_mnt_basic(void) int ret; uint64_t mask = STATMOUNT_MNT_BASIC; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount mnt basic: %s\n", strerror(errno)); @@ -323,7 +332,7 @@ static void test_statmount_sb_basic(void) struct statx sx; struct statfs sf; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount sb basic: %s\n", strerror(errno)); @@ -375,7 +384,7 @@ static void test_statmount_mnt_point(void) { struct statmount *sm; - sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_POINT, 0); if (!sm) { ksft_test_result_fail("statmount mount point: %s\n", strerror(errno)); @@ -405,7 +414,7 @@ static void test_statmount_mnt_root(void) assert(last_dir); last_dir++; - sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT, 0); if (!sm) { ksft_test_result_fail("statmount mount root: %s\n", strerror(errno)); @@ -438,7 +447,7 @@ static void test_statmount_fs_type(void) const char *fs_type; const char *const *s; - sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); if (!sm) { ksft_test_result_fail("statmount fs type: %s\n", strerror(errno)); @@ -467,7 +476,7 @@ static void test_statmount_mnt_opts(void) char *line = NULL; size_t len = 0; - sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, 0); if (!sm) { ksft_test_result_fail("statmount mnt opts: %s\n", @@ -557,7 +566,7 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) uint32_t start, i; int ret; - sm = statmount_alloc(root_id, mask, 0); + sm = statmount_alloc(root_id, 0, mask, 0); if (!sm) { ksft_test_result_fail("statmount %s: %s\n", name, strerror(errno)); @@ -586,14 +595,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) exactsize = sm->size; shortsize = sizeof(*sm) + i; - ret = statmount(root_id, 0, mask, sm, exactsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, exactsize, 0); if (ret == -1) { ksft_test_result_fail("statmount exact size: %s\n", strerror(errno)); goto out; } errno = 0; - ret = statmount(root_id, 0, mask, sm, shortsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, shortsize, 0); if (ret != -1 || errno != EOVERFLOW) { ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", strerror(errno)); @@ -658,6 +667,226 @@ static void test_listmount_tree(void) ksft_test_result_pass("listmount tree\n"); } +static void test_statmount_by_fd(void) +{ + struct statmount *sm = NULL; + char tmpdir[] = "/statmount.fd.XXXXXX"; + const char root[] = "/test"; + char subdir[PATH_MAX], tmproot[PATH_MAX]; + int fd; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + snprintf(tmproot, PATH_MAX, "%s/%s", tmpdir, "chroot"); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, NULL, MS_BIND, 0)) { + ksft_perror("mount"); + goto err_subdir; + } + + if (mkdir(tmproot, 0755)) { + ksft_perror("mkdir"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_tmproot; + } + + if (chroot(tmproot)) { + ksft_perror("chroot"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_chroot; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_chroot; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in statmount\n"); + goto err_chroot; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto err_chroot; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_chroot; + } + + if (chroot(".")) { + ksft_perror("chroot"); + goto out; + } + + free(sm); + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_fd; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_POINT)) { + ksft_test_result_fail("STATMOUNT_MNT_POINT not set in statmount\n"); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto out; + } + + if (strcmp(subdir, sm->str + sm->mnt_point) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_point," + "statmount mnt_point: %s != %s\n", sm->str + sm->mnt_point, subdir); + goto out; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", sm->str + sm->mnt_root, root); + goto out; + } + + ksft_test_result_pass("statmount by fd\n"); + goto out; +err_chroot: + chroot("."); +out: + free(sm); +err_fd: + close(fd); +err_tmproot: + rmdir(tmproot); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + +static void test_statmount_by_fd_unmounted(void) +{ + const char root[] = "/test.unmounted"; + char tmpdir[] = "/statmount.fd.XXXXXX"; + char subdir[PATH_MAX]; + int fd; + struct statmount *sm = NULL; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, 0, MS_BIND, NULL)) { + ksft_perror("mount"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_subdir; + } + + if (umount2(tmpdir, MNT_DETACH)) { + ksft_perror("umount2"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd unmounted: %s\n", + strerror(errno)); + goto err_sm; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_sm; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in mask\n"); + goto err_sm; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in mask\n"); + goto err_sm; + } + + if (strcmp(sm->str + sm->mnt_root, root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_sm; + } + + ksft_test_result_pass("statmount by fd on unmounted mount\n"); +err_sm: + free(sm); +err_fd: + close(fd); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t)) int main(void) @@ -669,14 +898,14 @@ int main(void) ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); setup_namespace(); - ksft_set_plan(15); + ksft_set_plan(17); test_listmount_empty_root(); test_statmount_zero_mask(); test_statmount_mnt_basic(); @@ -693,6 +922,8 @@ int main(void) test_statmount_string(all_mask, str_off(fs_type), "fs type & all"); test_listmount_tree(); + test_statmount_by_fd_unmounted(); + test_statmount_by_fd(); if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index d56d4103182f..063d9de46431 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -102,7 +102,7 @@ static int _test_statmount_mnt_ns_id(void) if (!root_id) return NSID_ERROR; - ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret == -1) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); return NSID_ERROR; @@ -128,6 +128,98 @@ static int _test_statmount_mnt_ns_id(void) return NSID_PASS; } +static int _test_statmount_mnt_ns_id_by_fd(void) +{ + struct statmount sm; + uint64_t mnt_ns_id; + int ret, fd, mounted = 1, status = NSID_ERROR; + char mnt[] = "/statmount.fd.XXXXXX"; + + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); + if (ret != NSID_PASS) + return ret; + + if (!mkdtemp(mnt)) { + ksft_print_msg("statmount by fd mnt ns id mkdtemp: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (mount(mnt, mnt, NULL, MS_BIND, 0)) { + ksft_print_msg("statmount by fd mnt ns id mount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto err; + } + + fd = open(mnt, O_PATH); + if (fd < 0) { + ksft_print_msg("statmount by fd mnt ns id open: %s\n", strerror(errno)); + goto err; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + status = NSID_SKIP; + goto out; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + status = NSID_FAIL; + goto out; + } + + mounted = 0; + if (umount2(mnt, MNT_DETACH)) { + ksft_print_msg("statmount by fd mnt ns id umount2: %s\n", strerror(errno)); + goto out; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + + if (sm.mask == STATMOUNT_MNT_NS_ID) { + ksft_print_msg("unexpected STATMOUNT_MNT_NS_ID in mask\n"); + status = NSID_FAIL; + goto out; + } + + status = NSID_PASS; +out: + close(fd); + if (mounted) + umount2(mnt, MNT_DETACH); +err: + rmdir(mnt); + return status; +} + + static void test_statmount_mnt_ns_id(void) { pid_t pid; @@ -148,6 +240,9 @@ static void test_statmount_mnt_ns_id(void) if (ret != NSID_PASS) exit(ret); ret = _test_statmount_mnt_ns_id(); + if (ret != NSID_PASS) + exit(ret); + ret = _test_statmount_mnt_ns_id_by_fd(); exit(ret); } @@ -179,7 +274,7 @@ static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) for (int i = 0; i < nr_mounts; i++) { struct statmount sm; - ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, + ret = statmount(list[i], mnt_ns_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret < 0) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); @@ -275,7 +370,7 @@ int main(void) int ret; ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c9dd5412b37b..d6f26f849053 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -515,6 +515,32 @@ int setup_userns(void) return 0; } +int enter_userns(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWUSER); + if (ret) + return ret; + + sprintf(buf, "0 %d 1", uid); + ret = write_file("/proc/self/uid_map", buf); + if (ret) + return ret; + ret = write_file("/proc/self/setgroups", "deny"); + if (ret) + return ret; + sprintf(buf, "0 %d 1", gid); + ret = write_file("/proc/self/gid_map", buf); + if (ret) + return ret; + + return 0; +} + /* caps_down - lower all effective caps */ int caps_down(void) { diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h index 70f7ccc607f4..0bccfed666a9 100644 --- a/tools/testing/selftests/filesystems/utils.h +++ b/tools/testing/selftests/filesystems/utils.h @@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down); extern bool switch_ids(uid_t uid, gid_t gid); extern int setup_userns(void); +extern int enter_userns(void); static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) { diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 531228b849da..80ab60905865 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -116,10 +116,8 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *wq), - unsigned int flags__k, void *aux__ign) __weak __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #endif /* __HID_BPF_HELPERS_H */ diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..d45bf4ccb3bf 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -251,6 +251,7 @@ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ + -U_FORTIFY_SOURCE \ -fno-builtin-memcmp -fno-builtin-memcpy \ -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -fno-strict-aliasing \ diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore index a820329cae0d..1974e17a2611 100644 --- a/tools/testing/selftests/landlock/.gitignore +++ b/tools/testing/selftests/landlock/.gitignore @@ -1,4 +1,5 @@ /*_test +/fs_bench /sandbox-and-launch /true /wait-pipe diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile index 044b83bde16e..fc43225d319a 100644 --- a/tools/testing/selftests/landlock/Makefile +++ b/tools/testing/selftests/landlock/Makefile @@ -9,6 +9,7 @@ LOCAL_HDRS += $(wildcard *.h) src_test := $(wildcard *_test.c) TEST_GEN_PROGS := $(src_test:.c=) +TEST_GEN_PROGS += fs_bench TEST_GEN_PROGS_EXTENDED := \ true \ diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 7b69002239d7..0fea236ef4bd 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -76,7 +76,7 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(7, landlock_create_ruleset(NULL, 0, + ASSERT_EQ(8, landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, @@ -288,7 +288,7 @@ TEST(restrict_self_fd) EXPECT_EQ(EBADFD, errno); } -TEST(restrict_self_fd_flags) +TEST(restrict_self_fd_logging_flags) { int fd; @@ -304,9 +304,9 @@ TEST(restrict_self_fd_flags) EXPECT_EQ(EBADFD, errno); } -TEST(restrict_self_flags) +TEST(restrict_self_logging_flags) { - const __u32 last_flag = LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF; + const __u32 last_flag = LANDLOCK_RESTRICT_SELF_TSYNC; /* Tests invalid flag combinations. */ diff --git a/tools/testing/selftests/landlock/fs_bench.c b/tools/testing/selftests/landlock/fs_bench.c new file mode 100644 index 000000000000..d13a88dcd1ed --- /dev/null +++ b/tools/testing/selftests/landlock/fs_bench.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Landlock filesystem benchmark + * + * This program benchmarks the time required for file access checks. We use a + * large number (-d flag) of nested directories where each directory inode has + * an associated Landlock rule, and we repeatedly (-n flag) exercise a file + * access for which Landlock has to walk the path all the way up to the root. + * + * With an increasing number of nested subdirectories, Landlock's portion of the + * overall system call time increases, which makes the effects of Landlock + * refactorings more measurable. + * + * This benchmark does *not* measure the building of the Landlock ruleset. The + * time required to add all these rules is not large enough to be easily + * measurable. A separate benchmark tool would be better to test that, and that + * tool could then also use a simpler file system layout. + * + * Copyright © 2026 Google LLC + */ + +#define _GNU_SOURCE +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/landlock.h> +#include <linux/prctl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/times.h> +#include <time.h> +#include <unistd.h> + +#include "wrappers.h" + +static void usage(const char *const argv0) +{ + printf("Usage:\n"); + printf(" %s [OPTIONS]\n", argv0); + printf("\n"); + printf(" Benchmark expensive Landlock checks for D nested dirs\n"); + printf("\n"); + printf("Options:\n"); + printf(" -h help\n"); + printf(" -L disable Landlock (as a baseline)\n"); + printf(" -d D set directory depth to D\n"); + printf(" -n N set number of benchmark iterations to N\n"); +} + +/* + * Build a deep directory, enforce Landlock and return the FD to the + * deepest dir. On any failure, exit the process with an error. + */ +static int build_directory(size_t depth, const bool use_landlock) +{ + const char *path = "d"; /* directory name */ + int abi, ruleset_fd, curr, prev; + + if (use_landlock) { + abi = landlock_create_ruleset(NULL, 0, + LANDLOCK_CREATE_RULESET_VERSION); + if (abi < 7) + err(1, "Landlock ABI too low: got %d, wanted 7+", abi); + } + + ruleset_fd = -1; + if (use_landlock) { + struct landlock_ruleset_attr attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV | + LANDLOCK_ACCESS_FS_WRITE_FILE | + LANDLOCK_ACCESS_FS_MAKE_REG, + }; + ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0U); + if (ruleset_fd < 0) + err(1, "landlock_create_ruleset"); + } + + curr = open(".", O_PATH); + if (curr < 0) + err(1, "open(.)"); + + while (depth--) { + if (use_landlock) { + struct landlock_path_beneath_attr attr = { + .allowed_access = LANDLOCK_ACCESS_FS_IOCTL_DEV, + .parent_fd = curr, + }; + if (landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_PATH_BENEATH, &attr, + 0) < 0) + err(1, "landlock_add_rule"); + } + + if (mkdirat(curr, path, 0700) < 0) + err(1, "mkdirat(%s)", path); + + prev = curr; + curr = openat(curr, path, O_PATH); + if (curr < 0) + err(1, "openat(%s)", path); + + close(prev); + } + + if (use_landlock) { + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) + err(1, "prctl"); + + if (landlock_restrict_self(ruleset_fd, 0) < 0) + err(1, "landlock_restrict_self"); + } + + close(ruleset_fd); + return curr; +} + +static void remove_recursively(const size_t depth) +{ + const char *path = "d"; /* directory name */ + + int fd = openat(AT_FDCWD, ".", O_PATH); + + if (fd < 0) + err(1, "openat(.)"); + + for (size_t i = 0; i < depth - 1; i++) { + int oldfd = fd; + + fd = openat(fd, path, O_PATH); + if (fd < 0) + err(1, "openat(%s)", path); + close(oldfd); + } + + for (size_t i = 0; i < depth; i++) { + if (unlinkat(fd, path, AT_REMOVEDIR) < 0) + err(1, "unlinkat(%s)", path); + int newfd = openat(fd, "..", O_PATH); + + close(fd); + fd = newfd; + } + close(fd); +} + +int main(int argc, char *argv[]) +{ + bool use_landlock = true; + size_t num_iterations = 100000; + size_t num_subdirs = 10000; + int c, curr, fd; + struct tms start_time, end_time; + + setbuf(stdout, NULL); + while ((c = getopt(argc, argv, "hLd:n:")) != -1) { + switch (c) { + case 'h': + usage(argv[0]); + return EXIT_SUCCESS; + case 'L': + use_landlock = false; + break; + case 'd': + num_subdirs = atoi(optarg); + break; + case 'n': + num_iterations = atoi(optarg); + break; + default: + usage(argv[0]); + return EXIT_FAILURE; + } + } + + printf("*** Benchmark ***\n"); + printf("%zu dirs, %zu iterations, %s Landlock\n", num_subdirs, + num_iterations, use_landlock ? "with" : "without"); + + if (times(&start_time) == -1) + err(1, "times"); + + curr = build_directory(num_subdirs, use_landlock); + + for (int i = 0; i < num_iterations; i++) { + fd = openat(curr, "file.txt", O_CREAT | O_TRUNC | O_WRONLY, + 0600); + if (use_landlock) { + if (fd == 0) + errx(1, "openat succeeded, expected EACCES"); + if (errno != EACCES) + err(1, "openat expected EACCES, but got"); + } + if (fd != -1) + close(fd); + } + + if (times(&end_time) == -1) + err(1, "times"); + + printf("*** Benchmark concluded ***\n"); + printf("System: %ld clocks\n", + end_time.tms_stime - start_time.tms_stime); + printf("User : %ld clocks\n", + end_time.tms_utime - start_time.tms_utime); + printf("Clocks per second: %ld\n", CLOCKS_PER_SEC); + + close(curr); + + remove_recursively(num_subdirs); +} diff --git a/tools/testing/selftests/landlock/tsync_test.c b/tools/testing/selftests/landlock/tsync_test.c new file mode 100644 index 000000000000..37ef0d2270db --- /dev/null +++ b/tools/testing/selftests/landlock/tsync_test.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Landlock tests - Enforcing the same restrictions across multiple threads + * + * Copyright © 2025 Günther Noack <gnoack3000@gmail.com> + */ + +#define _GNU_SOURCE +#include <pthread.h> +#include <sys/prctl.h> +#include <linux/landlock.h> + +#include "common.h" + +/* create_ruleset - Create a simple ruleset FD common to all tests */ +static int create_ruleset(struct __test_metadata *const _metadata) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = (LANDLOCK_ACCESS_FS_WRITE_FILE | + LANDLOCK_ACCESS_FS_TRUNCATE), + }; + const int ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + + ASSERT_LE(0, ruleset_fd) + { + TH_LOG("landlock_create_ruleset: %s", strerror(errno)); + } + return ruleset_fd; +} + +TEST(single_threaded_success) +{ + const int ruleset_fd = create_ruleset(_metadata); + + disable_caps(_metadata); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, + LANDLOCK_RESTRICT_SELF_TSYNC)); + + EXPECT_EQ(0, close(ruleset_fd)); +} + +static void store_no_new_privs(void *data) +{ + bool *nnp = data; + + if (!nnp) + return; + *nnp = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); +} + +static void *idle(void *data) +{ + pthread_cleanup_push(store_no_new_privs, data); + + while (true) + sleep(1); + + pthread_cleanup_pop(1); +} + +TEST(multi_threaded_success) +{ + pthread_t t1, t2; + bool no_new_privs1, no_new_privs2; + const int ruleset_fd = create_ruleset(_metadata); + + disable_caps(_metadata); + + ASSERT_EQ(0, pthread_create(&t1, NULL, idle, &no_new_privs1)); + ASSERT_EQ(0, pthread_create(&t2, NULL, idle, &no_new_privs2)); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + + EXPECT_EQ(0, landlock_restrict_self(ruleset_fd, + LANDLOCK_RESTRICT_SELF_TSYNC)); + + ASSERT_EQ(0, pthread_cancel(t1)); + ASSERT_EQ(0, pthread_cancel(t2)); + ASSERT_EQ(0, pthread_join(t1, NULL)); + ASSERT_EQ(0, pthread_join(t2, NULL)); + + /* The no_new_privs flag was implicitly enabled on all threads. */ + EXPECT_TRUE(no_new_privs1); + EXPECT_TRUE(no_new_privs2); + + EXPECT_EQ(0, close(ruleset_fd)); +} + +TEST(multi_threaded_success_despite_diverging_domains) +{ + pthread_t t1, t2; + const int ruleset_fd = create_ruleset(_metadata); + + disable_caps(_metadata); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + + ASSERT_EQ(0, pthread_create(&t1, NULL, idle, NULL)); + ASSERT_EQ(0, pthread_create(&t2, NULL, idle, NULL)); + + /* + * The main thread enforces a ruleset, + * thereby bringing the threads' Landlock domains out of sync. + */ + EXPECT_EQ(0, landlock_restrict_self(ruleset_fd, 0)); + + /* Still, TSYNC succeeds, bringing the threads in sync again. */ + EXPECT_EQ(0, landlock_restrict_self(ruleset_fd, + LANDLOCK_RESTRICT_SELF_TSYNC)); + + ASSERT_EQ(0, pthread_cancel(t1)); + ASSERT_EQ(0, pthread_cancel(t2)); + ASSERT_EQ(0, pthread_join(t1, NULL)); + ASSERT_EQ(0, pthread_join(t2, NULL)); + EXPECT_EQ(0, close(ruleset_fd)); +} + +struct thread_restrict_data { + pthread_t t; + int ruleset_fd; + int result; +}; + +static void *thread_restrict(void *data) +{ + struct thread_restrict_data *d = data; + + d->result = landlock_restrict_self(d->ruleset_fd, + LANDLOCK_RESTRICT_SELF_TSYNC); + return NULL; +} + +TEST(competing_enablement) +{ + const int ruleset_fd = create_ruleset(_metadata); + struct thread_restrict_data d[] = { + { .ruleset_fd = ruleset_fd }, + { .ruleset_fd = ruleset_fd }, + }; + + disable_caps(_metadata); + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, pthread_create(&d[0].t, NULL, thread_restrict, &d[0])); + ASSERT_EQ(0, pthread_create(&d[1].t, NULL, thread_restrict, &d[1])); + + /* Wait for threads to finish. */ + ASSERT_EQ(0, pthread_join(d[0].t, NULL)); + ASSERT_EQ(0, pthread_join(d[1].t, NULL)); + + /* Expect that both succeeded. */ + EXPECT_EQ(0, d[0].result); + EXPECT_EQ(0, d[1].result); + + EXPECT_EQ(0, close(ruleset_fd)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt index 67cd53715d93..e62b85b591be 100644 --- a/tools/testing/selftests/lkdtm/tests.txt +++ b/tools/testing/selftests/lkdtm/tests.txt @@ -11,6 +11,8 @@ EXCEPTION #CORRUPT_STACK Crashes entire system on success #CORRUPT_STACK_STRONG Crashes entire system on success ARRAY_BOUNDS call trace:|UBSAN: array-index-out-of-bounds +FAM_BOUNDS call trace:|UBSAN: array-index-out-of-bounds +PTR_BOUNDS call trace:|UBSAN: array-index-out-of-bounds CORRUPT_LIST_ADD list_add corruption CORRUPT_LIST_DEL list_del corruption STACK_GUARD_PAGE_LEADING diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index eaf9312097f7..905f1e034963 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for mm selftests +# IMPORTANT: If you add a new test CATEGORY please add a simple wrapper +# script so kunit knows to run it, and add it to the list below. +# If you do not YOUR TESTS WILL NOT RUN IN THE CI. + LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h LOCAL_HDRS += $(selfdir)/mm/mseal_helpers.h @@ -44,14 +48,10 @@ LDLIBS = -lrt -lpthread -lm # warnings. CFLAGS += -U_FORTIFY_SOURCE -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../..)) ifneq (,$(wildcard $(KDIR)/Module.symvers)) -ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h)) TEST_GEN_MODS_DIR := page_frag else -PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel" -endif -else PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first" endif @@ -140,13 +140,36 @@ endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch -ifneq ($(ARCH),riscv64) -TEST_GEN_FILES += virtual_address_range -endif TEST_GEN_FILES += write_to_hugetlbfs endif -TEST_PROGS := run_vmtests.sh +TEST_PROGS += ksft_compaction.sh +TEST_PROGS += ksft_cow.sh +TEST_PROGS += ksft_gup_test.sh +TEST_PROGS += ksft_hmm.sh +TEST_PROGS += ksft_hugetlb.sh +TEST_PROGS += ksft_hugevm.sh +TEST_PROGS += ksft_ksm.sh +TEST_PROGS += ksft_ksm_numa.sh +TEST_PROGS += ksft_madv_guard.sh +TEST_PROGS += ksft_madv_populate.sh +TEST_PROGS += ksft_memfd_secret.sh +TEST_PROGS += ksft_migration.sh +TEST_PROGS += ksft_mkdirty.sh +TEST_PROGS += ksft_mlock.sh +TEST_PROGS += ksft_mmap.sh +TEST_PROGS += ksft_mremap.sh +TEST_PROGS += ksft_pagemap.sh +TEST_PROGS += ksft_pfnmap.sh +TEST_PROGS += ksft_pkey.sh +TEST_PROGS += ksft_process_madv.sh +TEST_PROGS += ksft_process_mrelease.sh +TEST_PROGS += ksft_rmap.sh +TEST_PROGS += ksft_soft_dirty.sh +TEST_PROGS += ksft_thp.sh +TEST_PROGS += ksft_userfaultfd.sh +TEST_PROGS += ksft_vma_merge.sh +TEST_PROGS += ksft_vmalloc.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh @@ -154,6 +177,7 @@ TEST_FILES += va_high_addr_switch.sh TEST_FILES += charge_reserved_hugetlb.sh TEST_FILES += hugetlb_reparenting_test.sh TEST_FILES += test_page_frag.sh +TEST_FILES += run_vmtests.sh # required by charge_reserved_hugetlb.sh TEST_FILES += write_hugetlb_memory.sh @@ -234,7 +258,7 @@ $(OUTPUT)/migration: LDLIBS += -lnuma $(OUTPUT)/rmap: LDLIBS += -lnuma local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) + CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh EXTRA_CLEAN += local_config.mk local_config.h diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index e1fe16bcbbe8..447769657634 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -100,7 +100,7 @@ function setup_cgroup() { echo writing cgroup limit: "$cgroup_limit" echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - echo writing reseravation limit: "$reservation_limit" + echo writing reservation limit: "$reservation_limit" echo "$reservation_limit" > \ $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file @@ -112,41 +112,50 @@ function setup_cgroup() { fi } +function wait_for_file_value() { + local path="$1" + local expect="$2" + local max_tries=60 + + if [[ ! -r "$path" ]]; then + echo "ERROR: cannot read '$path', missing or permission denied" + return 1 + fi + + for ((i=1; i<=max_tries; i++)); do + local cur="$(cat "$path")" + if [[ "$cur" == "$expect" ]]; then + return 0 + fi + echo "Waiting for $path to become '$expect' (current: '$cur') (try $i/$max_tries)" + sleep 1 + done + + echo "ERROR: timeout waiting for $path to become '$expect'" + return 1 +} + function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "0" } function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. - cat $path - sleep 0.5 - done + + wait_for_file_value "$path" "$size" } function write_hugetlbfs_and_get_usage() { @@ -290,7 +299,7 @@ function run_test() { setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ @@ -344,7 +353,7 @@ function run_multiple_cgroup_test() { setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index 3954f4746161..b84c82bbf875 100755 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -16,8 +16,7 @@ echo "#include <sys/types.h>" > $tmpfile_c echo "#include <liburing.h>" >> $tmpfile_c echo "int func(void) { return 0; }" >> $tmpfile_c -CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 +$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index accfd198dbda..d9c69c04b67d 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -75,6 +75,18 @@ static bool range_is_swapped(void *addr, size_t size) return true; } +static bool populate_page_checked(char *addr) +{ + bool ret; + + FORCE_READ(*addr); + ret = pagemap_is_populated(pagemap_fd, addr); + if (!ret) + ksft_print_msg("Failed to populate page\n"); + + return ret; +} + struct comm_pipes { int child_ready[2]; int parent_ready[2]; @@ -1549,8 +1561,10 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage. */ - FORCE_READ(*mem); - FORCE_READ(*smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1612,8 +1626,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) * the first sub-page and test if we get another sub-page populated * automatically. */ - FORCE_READ(mem); - FORCE_READ(smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || !pagemap_is_populated(pagemap_fd, smem + pagesize)) { ksft_test_result_skip("Did not get THPs populated\n"); @@ -1663,8 +1680,10 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1719,8 +1738,10 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, pagesize); munmap: @@ -1773,8 +1794,10 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, } /* Fault the page in. */ - FORCE_READ(mem); - FORCE_READ(smem); + if (!populate_page_checked(mem) || !populate_page_checked(smem)) { + log_test_result(KSFT_FAIL); + goto munmap; + } fn(mem, smem, hugetlbsize); munmap: diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 05d9d2805ae4..5b12041fa310 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -47,14 +47,7 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i; - - for (i = 0; i < nr_pages; i++) { - unsigned long *addr2 = - ((unsigned long *)(addr + (i * huge_page_size))); - /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(*addr2); - } + force_read_pages(addr, nr_pages, huge_page_size); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/ksft_compaction.sh b/tools/testing/selftests/mm/ksft_compaction.sh new file mode 100755 index 000000000000..1f38f4228a34 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_compaction.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t compaction diff --git a/tools/testing/selftests/mm/ksft_cow.sh b/tools/testing/selftests/mm/ksft_cow.sh new file mode 100755 index 000000000000..1e03a95fd5f6 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_cow.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t cow diff --git a/tools/testing/selftests/mm/ksft_gup_test.sh b/tools/testing/selftests/mm/ksft_gup_test.sh new file mode 100755 index 000000000000..09e586d2f446 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_gup_test.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t gup_test diff --git a/tools/testing/selftests/mm/ksft_hmm.sh b/tools/testing/selftests/mm/ksft_hmm.sh new file mode 100755 index 000000000000..0a7b04f454d5 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_hmm.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t hmm diff --git a/tools/testing/selftests/mm/ksft_hugetlb.sh b/tools/testing/selftests/mm/ksft_hugetlb.sh new file mode 100755 index 000000000000..4f92974a4eb5 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_hugetlb.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t hugetlb diff --git a/tools/testing/selftests/mm/ksft_hugevm.sh b/tools/testing/selftests/mm/ksft_hugevm.sh new file mode 100755 index 000000000000..377967fe9c91 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_hugevm.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t hugevm diff --git a/tools/testing/selftests/mm/ksft_ksm.sh b/tools/testing/selftests/mm/ksft_ksm.sh new file mode 100755 index 000000000000..f6a6fe13a3b0 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_ksm.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t ksm diff --git a/tools/testing/selftests/mm/ksft_ksm_numa.sh b/tools/testing/selftests/mm/ksft_ksm_numa.sh new file mode 100755 index 000000000000..144b41a5e3bb --- /dev/null +++ b/tools/testing/selftests/mm/ksft_ksm_numa.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t ksm_numa diff --git a/tools/testing/selftests/mm/ksft_madv_guard.sh b/tools/testing/selftests/mm/ksft_madv_guard.sh new file mode 100755 index 000000000000..2d810c049182 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_madv_guard.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t madv_guard diff --git a/tools/testing/selftests/mm/ksft_madv_populate.sh b/tools/testing/selftests/mm/ksft_madv_populate.sh new file mode 100755 index 000000000000..127e22ed02c4 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_madv_populate.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t madv_populate diff --git a/tools/testing/selftests/mm/ksft_mdwe.sh b/tools/testing/selftests/mm/ksft_mdwe.sh new file mode 100755 index 000000000000..3dcae95ddabc --- /dev/null +++ b/tools/testing/selftests/mm/ksft_mdwe.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t mdwe diff --git a/tools/testing/selftests/mm/ksft_memfd_secret.sh b/tools/testing/selftests/mm/ksft_memfd_secret.sh new file mode 100755 index 000000000000..56e82dd648a7 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_memfd_secret.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t memfd_secret diff --git a/tools/testing/selftests/mm/ksft_migration.sh b/tools/testing/selftests/mm/ksft_migration.sh new file mode 100755 index 000000000000..7cf37c72d26e --- /dev/null +++ b/tools/testing/selftests/mm/ksft_migration.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t migration diff --git a/tools/testing/selftests/mm/ksft_mkdirty.sh b/tools/testing/selftests/mm/ksft_mkdirty.sh new file mode 100755 index 000000000000..dd6332df3204 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_mkdirty.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t mkdirty diff --git a/tools/testing/selftests/mm/ksft_mlock.sh b/tools/testing/selftests/mm/ksft_mlock.sh new file mode 100755 index 000000000000..1e25ab9fdc8b --- /dev/null +++ b/tools/testing/selftests/mm/ksft_mlock.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t mlock diff --git a/tools/testing/selftests/mm/ksft_mmap.sh b/tools/testing/selftests/mm/ksft_mmap.sh new file mode 100755 index 000000000000..2c3137ae8bc8 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_mmap.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t mmap diff --git a/tools/testing/selftests/mm/ksft_mremap.sh b/tools/testing/selftests/mm/ksft_mremap.sh new file mode 100755 index 000000000000..4101670d0e19 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_mremap.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t mremap diff --git a/tools/testing/selftests/mm/ksft_page_frag.sh b/tools/testing/selftests/mm/ksft_page_frag.sh new file mode 100755 index 000000000000..216e20ffe390 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_page_frag.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t page_frag diff --git a/tools/testing/selftests/mm/ksft_pagemap.sh b/tools/testing/selftests/mm/ksft_pagemap.sh new file mode 100755 index 000000000000..b8d270fdd43e --- /dev/null +++ b/tools/testing/selftests/mm/ksft_pagemap.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t pagemap diff --git a/tools/testing/selftests/mm/ksft_pfnmap.sh b/tools/testing/selftests/mm/ksft_pfnmap.sh new file mode 100755 index 000000000000..75758de968bb --- /dev/null +++ b/tools/testing/selftests/mm/ksft_pfnmap.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t pfnmap diff --git a/tools/testing/selftests/mm/ksft_pkey.sh b/tools/testing/selftests/mm/ksft_pkey.sh new file mode 100755 index 000000000000..ac944233b7f7 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_pkey.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t pkey diff --git a/tools/testing/selftests/mm/ksft_process_madv.sh b/tools/testing/selftests/mm/ksft_process_madv.sh new file mode 100755 index 000000000000..2c3137ae8bc8 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_process_madv.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t mmap diff --git a/tools/testing/selftests/mm/ksft_process_mrelease.sh b/tools/testing/selftests/mm/ksft_process_mrelease.sh new file mode 100755 index 000000000000..f560aa5e4218 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_process_mrelease.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t process_mrelease diff --git a/tools/testing/selftests/mm/ksft_rmap.sh b/tools/testing/selftests/mm/ksft_rmap.sh new file mode 100755 index 000000000000..974742b9b02f --- /dev/null +++ b/tools/testing/selftests/mm/ksft_rmap.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t rmap diff --git a/tools/testing/selftests/mm/ksft_soft_dirty.sh b/tools/testing/selftests/mm/ksft_soft_dirty.sh new file mode 100755 index 000000000000..d160d7fea0a9 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_soft_dirty.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t soft_dirty diff --git a/tools/testing/selftests/mm/ksft_thp.sh b/tools/testing/selftests/mm/ksft_thp.sh new file mode 100755 index 000000000000..95321aecabdb --- /dev/null +++ b/tools/testing/selftests/mm/ksft_thp.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t thp diff --git a/tools/testing/selftests/mm/ksft_userfaultfd.sh b/tools/testing/selftests/mm/ksft_userfaultfd.sh new file mode 100755 index 000000000000..92667abde6c6 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_userfaultfd.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t userfaultfd diff --git a/tools/testing/selftests/mm/ksft_vma_merge.sh b/tools/testing/selftests/mm/ksft_vma_merge.sh new file mode 100755 index 000000000000..68449d840680 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_vma_merge.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t vma_merge diff --git a/tools/testing/selftests/mm/ksft_vmalloc.sh b/tools/testing/selftests/mm/ksft_vmalloc.sh new file mode 100755 index 000000000000..0b5019a76612 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_vmalloc.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 + +./run_vmtests.sh -t vmalloc diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile index 8c8bb39ffa28..96e5f646e69b 100644 --- a/tools/testing/selftests/mm/page_frag/Makefile +++ b/tools/testing/selftests/mm/page_frag/Makefile @@ -1,5 +1,5 @@ PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= /lib/modules/$(shell uname -r)/build +KDIR ?= $(if $(O),$(O),$(realpath ../../../../..)) ifeq ($(V),1) Q = diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 2cb5441f29c7..2ca8a7e3c27e 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1052,11 +1052,10 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - long ret, fd, i, buf_size; + long ret, fd, i, buf_size, nr_pages; struct page_region *vec; char *mem, *fmem; struct stat sbuf; - char *tmp_buf; /* 1. wrong operation */ mem_size = 10 * page_size; @@ -1167,14 +1166,14 @@ int sanity_tests(void) if (fmem == MAP_FAILED) ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); - tmp_buf = malloc(sbuf.st_size); - memcpy(tmp_buf, fmem, sbuf.st_size); + nr_pages = (sbuf.st_size + page_size - 1) / page_size; + force_read_pages(fmem, nr_pages, page_size); ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0, 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS); ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && - LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) && + LEN(vec[0]) == nr_pages && (vec[0].categories & PAGE_IS_FILE), "%s Memory mapped file\n", __func__); @@ -1553,7 +1552,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) ksft_print_header(); if (init_uffd()) - ksft_exit_pass(); + ksft_exit_skip("Failed to initialize userfaultfd\n"); ksft_set_plan(117); @@ -1562,7 +1561,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) pagemap_fd = open(PAGEMAP, O_RDONLY); if (pagemap_fd < 0) - return -EINVAL; + ksft_exit_fail_msg("Failed to open " PAGEMAP "\n"); /* 1. Sanity testing */ sanity_tests_sd(); @@ -1734,5 +1733,5 @@ int main(int __attribute__((unused)) argc, char *argv[]) zeropfn_tests(); close(pagemap_fd); - ksft_exit_pass(); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index f546dfb10cae..4f550822385a 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -25,8 +25,12 @@ #include "kselftest_harness.h" #include "vm_util.h" +#define DEV_MEM_NPAGES 2 + static sigjmp_buf sigjmp_buf_env; static char *file = "/dev/mem"; +static off_t file_offset; +static int fd; static void signal_handler(int sig) { @@ -35,18 +39,15 @@ static void signal_handler(int sig) static int test_read_access(char *addr, size_t size, size_t pagesize) { - size_t offs; int ret; if (signal(SIGSEGV, signal_handler) == SIG_ERR) return -EINVAL; ret = sigsetjmp(sigjmp_buf_env, 1); - if (!ret) { - for (offs = 0; offs < size; offs += pagesize) - /* Force a read that the compiler cannot optimize out. */ - *((volatile char *)(addr + offs)); - } + if (!ret) + force_read_pages(addr, size/pagesize, pagesize); + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) return -EINVAL; @@ -91,7 +92,7 @@ static int find_ram_target(off_t *offset, break; /* We need two pages. */ - if (end > start + 2 * pagesize) { + if (end > start + DEV_MEM_NPAGES * pagesize) { fclose(file); *offset = start; return 0; @@ -100,11 +101,48 @@ static int find_ram_target(off_t *offset, return -ENOENT; } +static void pfnmap_init(void) +{ + size_t pagesize = getpagesize(); + size_t size = DEV_MEM_NPAGES * pagesize; + void *addr; + + if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { + int err = find_ram_target(&file_offset, pagesize); + + if (err) + ksft_exit_skip("Cannot find ram target in '/proc/iomem': %s\n", + strerror(-err)); + } else { + file_offset = 0; + } + + fd = open(file, O_RDONLY); + if (fd < 0) + ksft_exit_skip("Cannot open '%s': %s\n", file, strerror(errno)); + + /* + * Make sure we can map the file, and perform some basic checks; skip + * the whole suite if anything goes wrong. + * A fresh mapping is then created for every test case by + * FIXTURE_SETUP(pfnmap). + */ + addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset); + if (addr == MAP_FAILED) + ksft_exit_skip("Cannot mmap '%s': %s\n", file, strerror(errno)); + + if (!check_vmflag_pfnmap(addr)) + ksft_exit_skip("Invalid file: '%s'. Not pfnmap'ed\n", file); + + if (test_read_access(addr, size, pagesize)) + ksft_exit_skip("Cannot read-access mmap'ed '%s'\n", file); + + munmap(addr, size); +} + FIXTURE(pfnmap) { - off_t offset; size_t pagesize; - int dev_mem_fd; char *addr1; size_t size1; char *addr2; @@ -115,31 +153,10 @@ FIXTURE_SETUP(pfnmap) { self->pagesize = getpagesize(); - if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { - /* We'll require two physical pages throughout our tests ... */ - if (find_ram_target(&self->offset, self->pagesize)) - SKIP(return, - "Cannot find ram target in '/proc/iomem'\n"); - } else { - self->offset = 0; - } - - self->dev_mem_fd = open(file, O_RDONLY); - if (self->dev_mem_fd < 0) - SKIP(return, "Cannot open '%s'\n", file); - - self->size1 = self->pagesize * 2; + self->size1 = DEV_MEM_NPAGES * self->pagesize; self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); - if (self->addr1 == MAP_FAILED) - SKIP(return, "Cannot mmap '%s'\n", file); - - if (!check_vmflag_pfnmap(self->addr1)) - SKIP(return, "Invalid file: '%s'. Not pfnmap'ed\n", file); - - /* ... and want to be able to read from them. */ - if (test_read_access(self->addr1, self->size1, self->pagesize)) - SKIP(return, "Cannot read-access mmap'ed '%s'\n", file); + fd, file_offset); + ASSERT_NE(self->addr1, MAP_FAILED); self->size2 = 0; self->addr2 = MAP_FAILED; @@ -151,8 +168,6 @@ FIXTURE_TEARDOWN(pfnmap) munmap(self->addr2, self->size2); if (self->addr1 != MAP_FAILED) munmap(self->addr1, self->size1); - if (self->dev_mem_fd >= 0) - close(self->dev_mem_fd); } TEST_F(pfnmap, madvise_disallowed) @@ -192,7 +207,7 @@ TEST_F(pfnmap, munmap_split) */ self->size2 = self->pagesize; self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->offset); + fd, file_offset); ASSERT_NE(self->addr2, MAP_FAILED); } @@ -262,8 +277,12 @@ int main(int argc, char **argv) if (strcmp(argv[i], "--") == 0) { if (i + 1 < argc && strlen(argv[i + 1]) > 0) file = argv[i + 1]; - return test_harness_run(i, argv); + argc = i; + break; } } + + pfnmap_init(); + return test_harness_run(argc, argv); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d9173f2312b7..29be9038bfb0 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -2,6 +2,10 @@ # SPDX-License-Identifier: GPL-2.0 # Please run as root +# IMPORTANT: If you add a new test CATEGORY please add a simple wrapper +# script so kunit knows to run it, and add it to the list below. +# If you do not YOUR TESTS WILL NOT RUN IN THE CI. + # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -399,28 +403,8 @@ CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison fi if [ $VADDR64 -ne 0 ]; then - - # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel - # allows high virtual address allocation requests independent - # of platform's physical memory. - - if [ -x ./virtual_address_range ]; then - prev_policy=$(cat /proc/sys/vm/overcommit_memory) - echo 1 > /proc/sys/vm/overcommit_memory - CATEGORY="hugevm" run_test ./virtual_address_range - echo $prev_policy > /proc/sys/vm/overcommit_memory - fi - # va high address boundary switch test - ARCH_ARM64="arm64" - prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo 6 > /proc/sys/vm/nr_hugepages - fi CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh - if [ "$ARCH" == "$ARCH_ARM64" ]; then - echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages - fi fi # VADDR64 # vmalloc stability smoke test diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 40799f3f0213..e0167111bdd1 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -652,11 +652,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for (size_t i = 0; i < fd_size; i++) { - char *addr2 = *addr + i; - - FORCE_READ(*addr2); - } + force_read_pages(*addr, fd_size / pmd_pagesize, pmd_pagesize); if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh index d39096723fca..b23d705bf570 100755 --- a/tools/testing/selftests/mm/test_vmalloc.sh +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -13,6 +13,9 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" NUM_CPUS=`grep -c ^processor /proc/cpuinfo` +# Default number of times we allocate percpu objects: +NR_PCPU_OBJECTS=35000 + # 1 if fails exitcode=1 @@ -27,6 +30,8 @@ PERF_PARAM="sequential_test_order=1 test_repeat_count=3" SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" +PCPU_OBJ_PARAM="nr_pcpu_objects=$NR_PCPU_OBJECTS" + check_test_requirements() { uid=$(id -u) @@ -47,12 +52,30 @@ check_test_requirements() fi } +check_memory_requirement() +{ + # The pcpu_alloc_test allocates nr_pcpu_objects per cpu. If the + # PAGE_SIZE is on the larger side it is easier to set a value + # that can cause oom events during testing. Since we are + # testing the functionality of vmalloc and not the oom-killer, + # calculate what is 90% of available memory and divide it by + # the number of online CPUs. + pages=$(($(getconf _AVPHYS_PAGES) * 90 / 100 / $NUM_CPUS)) + + if (($pages < $NR_PCPU_OBJECTS)); then + echo "Updated nr_pcpu_objects to 90% of available memory." + echo "nr_pcpu_objects is now set to: $pages." + PCPU_OBJ_PARAM="nr_pcpu_objects=$pages" + fi +} + run_performance_check() { echo "Run performance tests to evaluate how fast vmalloc allocation is." echo "It runs all test cases on one single CPU with sequential order." - modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $PERF_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel message buffer to see the summary." } @@ -63,7 +86,8 @@ run_stability_check() echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." - modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $STRESS_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } @@ -74,7 +98,8 @@ run_smoke_check() echo "Please check $0 output how it can be used" echo "for deep performance analysis as well as stress testing." - modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 + check_memory_requirement + modprobe $DRIVER $SMOKE_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1 echo "Done." echo "Check the kernel ring buffer to see the summary." } diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 02f290a69132..51401e081b20 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -322,7 +322,7 @@ static int supported_arch(void) int main(int argc, char **argv) { - int ret; + int ret, hugetlb_ret = KSFT_PASS; if (!supported_arch()) return KSFT_SKIP; @@ -331,6 +331,10 @@ int main(int argc, char **argv) ret = run_test(testcases, sz_testcases); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); - return ret; + hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); + + if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS) + return KSFT_PASS; + else + return KSFT_FAIL; } diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index a7d4b02b21dd..9492c2d72634 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -61,9 +61,9 @@ check_supported_ppc64() check_test_requirements() { - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. + # The test supports x86_64, powerpc64 and arm64. There's check for arm64 + # in va_high_addr_switch.c. The test itself will reject other architectures. + case `uname -m` in "x86_64") check_supported_x86_64 @@ -111,7 +111,9 @@ setup_nr_hugepages() check_test_requirements save_nr_hugepages -# 4 keep_mapped pages, and one for tmp usage -setup_nr_hugepages 5 +# The HugeTLB tests require 6 pages +setup_nr_hugepages 6 ./va_high_addr_switch --run-hugetlb +retcode=$? restore_nr_hugepages +exit $retcode diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c deleted file mode 100644 index 4f0923825ed7..000000000000 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> -#include <sys/prctl.h> -#include <sys/mman.h> -#include <sys/time.h> -#include <fcntl.h> - -#include "vm_util.h" -#include "kselftest.h" - -/* - * Maximum address range mapped with a single mmap() - * call is little bit more than 1GB. Hence 1GB is - * chosen as the single chunk size for address space - * mapping. - */ - -#define SZ_1GB (1024 * 1024 * 1024UL) -#define SZ_1TB (1024 * 1024 * 1024 * 1024UL) - -#define MAP_CHUNK_SIZE SZ_1GB - -/* - * Address space till 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * till 512TB is obtained by passing hint address as the - * first argument into mmap() system call. - * - * The process heap address space is divided into two - * different areas one below 128TB and one above 128TB - * till it reaches 512TB. One with size 128TB and the - * other being 384TB. - * - * On Arm64 the address space is 256TB and support for - * high mappings up to 4PB virtual address space has - * been added. - * - * On PowerPC64, the address space up to 128TB can be - * mapped without a hint. Addresses beyond 128TB, up to - * 4PB, can be mapped with a hint. - * - */ - -#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) -#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) -#define NR_CHUNKS_3968TB (NR_CHUNKS_128TB * 31UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB -#elif defined(__PPC64__) -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_3968TB -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hint_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static void validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); - return; - } - - if (addr > HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); -} - -static void mark_range(char *ptr, size_t size) -{ - if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "virtual_address_range") == -1) { - if (errno == EINVAL) { - /* Depends on CONFIG_ANON_VMA_NAME */ - ksft_test_result_skip("prctl(PR_SET_VMA_ANON_NAME) not supported\n"); - ksft_finished(); - } else { - ksft_exit_fail_perror("prctl(PR_SET_VMA_ANON_NAME) failed\n"); - } - } -} - -static int is_marked_vma(const char *vma_name) -{ - return vma_name && !strcmp(vma_name, "[anon:virtual_address_range]\n"); -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -static int validate_complete_va_space(void) -{ - unsigned long start_addr, end_addr, prev_end_addr; - char line[400]; - char prot[6]; - FILE *file; - int fd; - - fd = open("va_dump", O_CREAT | O_WRONLY, 0600); - unlink("va_dump"); - if (fd < 0) { - ksft_test_result_skip("cannot create or open dump file\n"); - ksft_finished(); - } - - file = fopen("/proc/self/maps", "r"); - if (file == NULL) - ksft_exit_fail_msg("cannot open /proc/self/maps\n"); - - prev_end_addr = 0; - while (fgets(line, sizeof(line), file)) { - const char *vma_name = NULL; - int vma_name_start = 0; - unsigned long hop; - - if (sscanf(line, "%lx-%lx %4s %*s %*s %*s %n", - &start_addr, &end_addr, prot, &vma_name_start) != 3) - ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); - - if (vma_name_start) - vma_name = line + vma_name_start; - - /* end of userspace mappings; ignore vsyscall mapping */ - if (start_addr & (1UL << 63)) - return 0; - - /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ - if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) - return 1; - - prev_end_addr = end_addr; - - if (prot[0] != 'r') - continue; - - if (check_vmflag_io((void *)start_addr)) - continue; - - /* - * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. - * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 - * addresses after that. If the address was not held by this - * process, write would fail with errno set to EFAULT. - * Anyways, if write returns anything apart from 1, exit the - * program since that would mean a bug in /proc/self/maps. - */ - hop = 0; - while (start_addr + hop < end_addr) { - if (write(fd, (void *)(start_addr + hop), 1) != 1) - return 1; - lseek(fd, 0, SEEK_SET); - - if (is_marked_vma(vma_name)) - munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE); - - hop += MAP_CHUNK_SIZE; - } - } - return 0; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char **hptr; - char *hint; - unsigned long i, lchunks, hchunks; - - ksft_print_header(); - ksft_set_plan(1); - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); - break; - } - - mark_range(ptr[i], MAP_CHUNK_SIZE); - validate_addr(ptr[i], 0); - } - lchunks = i; - hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); - if (hptr == NULL) { - ksft_test_result_skip("Memory constraint not fulfilled\n"); - ksft_finished(); - } - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hint_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - mark_range(hptr[i], MAP_CHUNK_SIZE); - validate_addr(hptr[i], 1); - } - hchunks = i; - if (validate_complete_va_space()) { - ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); - ksft_finished(); - } - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - free(hptr); - - ksft_test_result_pass("Test\n"); - ksft_finished(); -} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 6ad32b1830f1..522f7f9050f5 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -54,6 +54,13 @@ static inline unsigned int pshift(void) return __page_shift; } +static inline void force_read_pages(char *addr, unsigned int nr_pages, + size_t pagesize) +{ + for (unsigned int i = 0; i < nr_pages; i++) + FORCE_READ(addr[i * pagesize]); +} + bool detect_huge_zeropage(void); /* diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 34c91f7e6128..ecb5f7619960 100644 --- a/tools/testing/selftests/mm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -68,7 +68,7 @@ int main(int argc, char **argv) int key = 0; int *ptr = NULL; int c = 0; - int size = 0; + size_t size = 0; char path[256] = ""; enum method method = MAX_METHOD; int want_sleep = 0, private = 0; @@ -86,7 +86,10 @@ int main(int argc, char **argv) while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { switch (c) { case 's': - size = atoi(optarg); + if (sscanf(optarg, "%zu", &size) != 1) { + perror("Invalid -s."); + exit_usage(); + } break; case 'p': strncpy(path, optarg, sizeof(path) - 1); @@ -131,7 +134,7 @@ int main(int argc, char **argv) } if (size != 0) { - printf("Writing this size: %d\n", size); + printf("Writing this size: %zu\n", size); } else { errno = EINVAL; perror("size not found"); diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 6930fe926c58..97ad4d551d44 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -7,6 +7,7 @@ cmsg_sender epoll_busy_poll fin_ack_lat hwtstamp_config +icmp_rfc4884 io_uring_zerocopy_tx ioam6_parser ip_defrag diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 45c4ea381bc3..afdea6d95bde 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -22,6 +22,7 @@ TEST_PROGS := \ cmsg_so_mark.sh \ cmsg_so_priority.sh \ cmsg_time.sh \ + double_udp_encap.sh \ drop_monitor_tests.sh \ fcnal-ipv4.sh \ fcnal-ipv6.sh \ @@ -167,6 +168,7 @@ TEST_GEN_PROGS := \ bind_timewait \ bind_wildcard \ epoll_busy_poll \ + icmp_rfc4884 \ ipv6_fragmentation \ proc_net_pktgen \ reuseaddr_conflict \ @@ -181,7 +183,6 @@ TEST_GEN_PROGS := \ tap \ tcp_port_share \ tls \ - tun \ # end of TEST_GEN_PROGS TEST_FILES := \ @@ -193,7 +194,11 @@ TEST_FILES := \ # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller -YNL_GEN_PROGS := netlink-dumps +YNL_GEN_PROGS := \ + netlink-dumps \ + tun \ +# end of YNL_GEN_PROGS + TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_GEN_PROGS += $(YNL_GEN_PROGS) @@ -204,7 +209,14 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk # YNL build -YNL_GENS := netdev +YNL_GENS := \ + netdev \ + rt-addr \ + rt-link \ + rt-neigh \ + rt-route \ +# end of YNL_GENS + include ynl.mk $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index b84362b9b508..cd49b7dfe216 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -77,6 +77,7 @@ CONFIG_NET_DROP_MONITOR=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NETFILTER_XTABLES_LEGACY=y +CONFIG_NETFILTER_XT_MATCH_BPF=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_NETFILTER_XT_NAT=m diff --git a/tools/testing/selftests/net/double_udp_encap.sh b/tools/testing/selftests/net/double_udp_encap.sh new file mode 100755 index 000000000000..9aaf97cdf141 --- /dev/null +++ b/tools/testing/selftests/net/double_udp_encap.sh @@ -0,0 +1,393 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +# shellcheck disable=SC2155 # prefer RO variable over return value from cmd +readonly CLI="$(dirname "$(readlink -f "$0")")/../../../net/ynl/pyynl/cli.py" + +readonly SRC=1 +readonly DST=2 + +readonly NET_V4=192.168.1. +readonly NET_V6=2001:db8:: +readonly OL1_NET_V4=172.16.1. +readonly OL1_NET_V6=2001:db8:1:: +readonly OL2_NET_V4=172.16.2. +readonly OL2_NET_V6=2001:db8:2:: + +trap cleanup_all_ns EXIT + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +create_gnv_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r gnv_dev=$3 + local -r gnv_id=$4 + local opts=$5 + local gnv_json + local rem + + if is_ipv6 "$bm_rem_addr"; then + rem=remote6 + else + rem=remote + fi + + # add ynl opt separator, if needed + [ -n "$opts" ] && opts=", $opts" + + gnv_json="{ \"id\": $gnv_id, \"$rem\": \"$bm_rem_addr\"$opts }" + ip netns exec "$netns" "$CLI" --family rt-link --create --excl \ + --do newlink --json "{\"ifname\": \"$gnv_dev\", + \"linkinfo\": {\"kind\":\"geneve\", + \"data\": $gnv_json } }" > /dev/null + ip -n "$netns" link set dev "$gnv_dev" up +} + +# shellcheck disable=SC2329 # can't figure out usage trough a variable +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r vxlan_dev=$3 + local -r vxlan_id=$4 + local -r opts_str=$5 + local oldifs + local -a opts + local opt + + # convert the arguments from yaml format + oldifs=$IFS + IFS=',' + for opt in $opts_str; do + local pattern='"port":' + + [ -n "$opt" ] || continue + + opts+=("${opt/$pattern*/dstport}" "${opt/$pattern/}") + done + IFS=$oldifs + [ ${#opts[@]} -gt 0 ] || opts+=("dstport" "4789") + + ip -n "$netns" link add "$vxlan_dev" type vxlan id "$vxlan_id" \ + remote "$bm_rem_addr" "${opts[@]}" + ip -n "$netns" link set dev "$vxlan_dev" up +} + +create_ns() { + local nested_opt='"port":6082' + local create_endpoint + local options="$1" + local feature + local dev + local id + local ns + + RET=0 + + # +-------------+ +-------------+ + # | NS_SRC | | NS_NST_DST | + # | | | | + # | gnv_nst1 | | gnv_nst2 | + # | + | | + | + # | | | | | | + # | + | | + | + # | gnv1 | | gnv2 | + # | + | | + | + # | | | | | | + # | + veth1 +--------+ veth2 + | + # | | | | + # +-------------+ +-------------+ + + setup_ns NS_SRC NS_DST + + # concatenate caller provided options and default one + [ -n "$2" ] && nested_opt="$nested_opt,$2" + + ip link add name "veth$SRC" netns "$NS_SRC" type veth \ + peer name "veth$DST" netns "$NS_DST" + case "$ENCAP" in + vxlan) + create_endpoint=create_vxlan_endpoint + dev=vx + ;; + geneve) + create_endpoint=create_gnv_endpoint + dev=gnv + ;; + esac + + id=1 + for ns in "${NS_LIST[@]}"; do + ip -n "$ns" link set dev "veth$id" up + + # ensure the sender can do large write just after 3whs + ip netns exec "$ns" \ + sysctl -qw net.ipv4.tcp_wmem="4096 4194304 4194304" + + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + if [ $FAMILY = "4" ]; then + ip -n "$ns" addr add dev "veth$id" "$NET_V4$id/24" + $create_endpoint "$ns" "$NET_V4$((3 - id))" \ + "$dev$id" 4 "$options" + ip -n "$ns" addr add dev "$dev$id" "$OL1_NET_V4$id/24" + + # nested tunnel devices + # pmtu can't be propagated to upper layer devices; + # need manual adjust + $create_endpoint "$ns" "$OL1_NET_V4$((3 - id))" \ + "$dev"_nst"$id" 40 "$nested_opt" + ip -n "$ns" addr add dev "$dev"_nst"$id" \ + "$OL2_NET_V4$id/24" + ip -n "$ns" link set dev "$dev"_nst"$id" mtu 1392 + else + ip -n "$ns" addr add dev "veth$id" "$NET_V6$id/64" \ + nodad + $create_endpoint "$ns" "$NET_V6$((3 - id))" \ + "$dev"6"$id" 6 "$options" + ip -n "$ns" addr add dev "$dev"6"$id" \ + "$OL1_NET_V6$id/64" nodad + + $create_endpoint "$ns" "$OL1_NET_V6$((3 - id))" \ + "$dev"6_nst"$id" 60 "$nested_opt" + ip -n "$ns" addr add dev "$dev"6_nst"$id" \ + "$OL2_NET_V6$id/64" nodad + ip -n "$ns" link set dev "$dev"6_nst"$id" mtu 1352 + fi + id=$((id+1)) + done + + # enable GRO heuristic on the veth peer and ensure UDP L4 over tunnel is + # actually segmented + for feature in tso tx-udp_tnl-segmentation; do + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" \ + "$feature" off 2>/dev/null + done +} + +create_ns_gso() { + local dev + + create_ns "$@" + if [ "$ENCAP" = "geneve" ]; then + dev=gnv + else + dev=vx + fi + [ "$FAMILY" = "6" ] && dev="$dev"6 + ip netns exec "$NS_SRC" ethtool -K "$dev$SRC" \ + tx-gso-partial on \ + tx-udp_tnl-segmentation on \ + tx-udp_tnl-csum-segmentation on +} + +create_ns_gso_gro() { + create_ns_gso "$@" + ip netns exec "$NS_DST" ethtool -K "veth$DST" gro on + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" tx off >/dev/null 2>&1 +} + +run_test() { + local -r dst=$NET$DST + local -r msg=$1 + local -r total_size=$2 + local -r encappkts=$3 + local inner_proto_offset=0 + local inner_maclen=14 + local rx_family="-4" + local ipt=iptables + local bpf_filter + local -a rx_args + local wire_pkts + local rcvpkts + local encl=8 + local dport + local pkts + local snd + + if [ $FAMILY = "6" ]; then + ipt=ip6tables + else + # rx program does not support '-6' and implies ipv6 usage by + # default + rx_args=("$rx_family") + fi + + # The received can only check fixed size packet + pkts=$((total_size / GSO_SIZE)) + if [ -n "$4" ]; then + wire_pkts=$4 + elif [ $((total_size % GSO_SIZE)) -eq 0 ]; then + wire_pkts=1 + rx_args+=("-l" "$GSO_SIZE") + else + wire_pkts=2 + pkts=$((pkts + 1)) + fi + + if [ "$ENCAP" = "geneve" ]; then + dport=6081 + else + dport=4789 + fi + + # Either: + # - IPv4, nested tunnel carries UDP over IPv4, with dport 6082, + # innermost is TCP over IPv4 on port 8000 + # - IPv6, nested tunnel carries UDP over IPv6, with dport 6082, + # innermost is TCP over IPv6 on port 8000 + # The nested tunnel port is 6082 and the nested encap len is 8 + # regardless of the encap type (no geneve opts). + # In inherit protocol mode there is no nested mac hdr and the nested + # l3 protocol type field belongs to the geneve hdr. + [ "$USE_HINT" = true ] && encl=16 + [ "$INHERIT" = true ] && inner_maclen=0 + [ "$INHERIT" = true ] && inner_proto_offset=-4 + local inner=$((inner_maclen+encl)) + local proto=$((inner_maclen+encl+inner_proto_offset)) + bpf_filter=$(nfbpf_compile "(ip && + ip[$((40+encl))] == 0x08 && ip[$((41+encl))] == 0x00 && + ip[$((51+encl))] == 0x11 && + ip[$((64+encl))] == 0x17 && ip[$((65+encl))] == 0xc2 && + ip[$((76+proto))] == 0x08 && ip[$((77+proto))] == 0x00 && + ip[$((87+inner))] == 0x6 && + ip[$((100+inner))] == 0x1f && ip[$((101+inner))] == 0x40) || + (ip6 && + ip6[$((60+encl))] == 0x86 && ip6[$((61+encl))] == 0xdd && + ip6[$((68+encl))] == 0x11 && + ip6[$((104+encl))] == 0x17 && ip6[$((105+encl))] == 0xc2 && + ip6[$((116+proto))] == 0x86 && ip6[$((117+proto))] == 0xdd && + ip6[$((124+inner))] == 0x6 && + ip6[$((160+inner))] == 0x1f && ip6[$((161+inner))] == 0x40)") + + # ignore shorts packet, to avoid arp/mld induced noise + ip netns exec "$NS_SRC" "$ipt" -A OUTPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" "$ipt" -A INPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" ./udpgso_bench_rx -C 2000 -t -R 100 \ + -n "$pkts" "${rx_args[@]}" & + local pid=$! + wait_local_port_listen "$NS_DST" 8000 tcp + ip netns exec "$NS_SRC" ./udpgso_bench_tx -"$FAMILY" -t -M 1 \ + -s "$total_size" -D "$dst" + local ret=$? + check_err "$ret" "client failure exit code $ret" + wait "$pid" + ret=$? + check_err "$ret" "sever failure exit code $ret" + + snd=$(ip netns exec "$NS_SRC" "$ipt"-save -c | + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$snd" = "$wire_pkts" ] + # shellcheck disable=SC2319 # known false positive + check_err $? "send $snd packets on the lowest link, expected $wire_pkts" + + rcvpkts=$(ip netns exec "$NS_DST" "$ipt"-save -c | \ + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$rcvpkts" = "$encappkts" ] + check_err $? "received $rcvpkts $ENCAP packets, expected $encappkts" + log_test "$msg" +} + +run_tests() { + for FAMILY in 4 6; do + NET=$OL2_NET_V4 + GSO_SIZE=1340 # 1392 - 20 - 32 + + if [ $FAMILY = 6 ]; then + NET=$OL2_NET_V6 + GSO_SIZE=1280 # 1352 - 40 - 32 + fi + + echo "IPv$FAMILY" + + unset USE_HINT + unset INHERIT + + # "geneve" must be last encap in list, so that later + # test cases will run on it + for ENCAP in "vxlan" "geneve"; do + create_ns + run_test "No GSO - $ENCAP" $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + + create_ns_gso + run_test "GSO without GRO - $ENCAP" $((GSO_SIZE * 4)) \ + 4 1 + cleanup_all_ns + + # IPv4 only test + [ $FAMILY = "4" ] || continue + create_ns_gso + ip netns exec "$NS_SRC" \ + sysctl -qw net.ipv4.ip_no_pmtu_disc=1 + run_test "GSO disable due to no fixedid - $ENCAP" \ + $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + done + + # GRO tests imply/require geneve encap, the only one providing + # GRO hints + create_ns_gso_gro + run_test "double tunnel GRO, no hints" $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + # hint option is expected for all the following tests in the RX + # path + USE_HINT=true + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO" $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' '"udp-csum":1' + run_test "double tunnel GRO - csum complete" $((GSO_SIZE * 4))\ + 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1' \ + '"udp-csum":0,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO - no nested csum" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-csum":1' + run_test "double tunnel GRO - nested csum, outer 0-csum, skip"\ + $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + INHERIT=true + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' \ + '"udp-csum":1,"inner-proto-inherit":1' + run_test "double tunnel GRO - nested inherit proto" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + unset INHERIT + + create_ns_gso_gro '"gro-hint":1' + run_test "double tunnel GRO - short last pkt" \ + $((GSO_SIZE * 4 + GSO_SIZE / 2)) 2 + cleanup_all_ns + done +} + +require_command nfbpf_compile +require_command jq + +# tcp retransmisions will break the accounting +xfail_on_slow run_tests +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh index c01be076b210..e0d45292a298 100755 --- a/tools/testing/selftests/net/fib-onlink-tests.sh +++ b/tools/testing/selftests/net/fib-onlink-tests.sh @@ -72,7 +72,8 @@ declare -A TEST_NET4IN6IN6 TEST_NET4IN6[1]=10.1.1.254 TEST_NET4IN6[2]=10.2.1.254 -# mcast address +# mcast addresses +MCAST4=233.252.0.1 MCAST6=ff02::1 VRF=lisa @@ -260,11 +261,15 @@ valid_onlink_ipv4() run_ip 254 ${TEST_NET4[1]}.1 ${CONGW[1]} ${NETIFS[p1]} 0 "unicast connected" run_ip 254 ${TEST_NET4[1]}.2 ${RECGW4[1]} ${NETIFS[p1]} 0 "unicast recursive" + run_ip 254 ${TEST_NET4[1]}.9 ${CONGW[1]} ${NETIFS[p3]} 0 \ + "nexthop device mismatch" log_subsection "VRF ${VRF}" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.1 ${CONGW[3]} ${NETIFS[p5]} 0 "unicast connected" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.2 ${RECGW4[2]} ${NETIFS[p5]} 0 "unicast recursive" + run_ip ${VRF_TABLE} ${TEST_NET4[2]}.10 ${CONGW[3]} ${NETIFS[p7]} 0 \ + "nexthop device mismatch" log_subsection "VRF device, PBR table" @@ -300,17 +305,15 @@ invalid_onlink_ipv4() { run_ip 254 ${TEST_NET4[1]}.11 ${V4ADDRS[p1]} ${NETIFS[p1]} 2 \ "Invalid gw - local unicast address" + run_ip 254 ${TEST_NET4[1]}.12 ${MCAST4} ${NETIFS[p1]} 2 \ + "Invalid gw - multicast address" run_ip ${VRF_TABLE} ${TEST_NET4[2]}.11 ${V4ADDRS[p5]} ${NETIFS[p5]} 2 \ "Invalid gw - local unicast address, VRF" + run_ip ${VRF_TABLE} ${TEST_NET4[2]}.12 ${MCAST4} ${NETIFS[p5]} 2 \ + "Invalid gw - multicast address, VRF" run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given" - - run_ip 254 ${TEST_NET4[1]}.102 ${V4ADDRS[p3]} ${NETIFS[p1]} 2 \ - "Gateway resolves to wrong nexthop device" - - run_ip ${VRF_TABLE} ${TEST_NET4[2]}.103 ${V4ADDRS[p7]} ${NETIFS[p5]} 2 \ - "Gateway resolves to wrong nexthop device - VRF" } ################################################################################ @@ -357,12 +360,16 @@ valid_onlink_ipv6() run_ip6 254 ${TEST_NET6[1]}::1 ${V6ADDRS[p1]/::*}::64 ${NETIFS[p1]} 0 "unicast connected" run_ip6 254 ${TEST_NET6[1]}::2 ${RECGW6[1]} ${NETIFS[p1]} 0 "unicast recursive" run_ip6 254 ${TEST_NET6[1]}::3 ::ffff:${TEST_NET4IN6[1]} ${NETIFS[p1]} 0 "v4-mapped" + run_ip6 254 ${TEST_NET6[1]}::a ${V6ADDRS[p1]/::*}::64 ${NETIFS[p3]} 0 \ + "nexthop device mismatch" log_subsection "VRF ${VRF}" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::1 ${V6ADDRS[p5]/::*}::64 ${NETIFS[p5]} 0 "unicast connected" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::2 ${RECGW6[2]} ${NETIFS[p5]} 0 "unicast recursive" run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::3 ::ffff:${TEST_NET4IN6[2]} ${NETIFS[p5]} 0 "v4-mapped" + run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::b ${V6ADDRS[p5]/::*}::64 \ + ${NETIFS[p7]} 0 "nexthop device mismatch" log_subsection "VRF device, PBR table" @@ -428,13 +435,6 @@ invalid_onlink_ipv6() run_ip6 254 ${TEST_NET6[1]}::101 ${V6ADDRS[p1]} "" 2 \ "No nexthop device given" - - # default VRF validation is done against LOCAL table - # run_ip6 254 ${TEST_NET6[1]}::102 ${V6ADDRS[p3]/::[0-9]/::64} ${NETIFS[p1]} 2 \ - # "Gateway resolves to wrong nexthop device" - - run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::103 ${V6ADDRS[p7]/::[0-9]/::64} ${NETIFS[p5]} 2 \ - "Gateway resolves to wrong nexthop device - VRF" } run_onlink_tests() diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index 892895659c7e..1f2bf6e81847 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -306,39 +306,39 @@ run_test() if [ $skip_ptp = false ]; then check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \ - "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \ + "ethertype PTP (0x88f7).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \ - "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \ + "ethertype IPv4 (0x0800).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: sync msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: follow up msg" \ true "$test_name" check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \ - "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \ + "ethertype IPv6 (0x86dd).* PTPv2.* msg type *: peer delay req msg" \ true "$test_name" fi diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile index 4b6afc0fe9f8..31fb9326cf53 100644 --- a/tools/testing/selftests/net/hsr/Makefile +++ b/tools/testing/selftests/net/hsr/Makefile @@ -5,6 +5,8 @@ top_srcdir = ../../../../.. TEST_PROGS := \ hsr_ping.sh \ hsr_redbox.sh \ + link_faults.sh \ + prp_ping.sh \ # end of TEST_PROGS TEST_FILES += hsr_common.sh diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh index 5a65f4f836be..f4d685df4345 100755 --- a/tools/testing/selftests/net/hsr/hsr_ping.sh +++ b/tools/testing/selftests/net/hsr/hsr_ping.sh @@ -27,31 +27,34 @@ while getopts "$optstring" option;do esac done -do_complete_ping_test() +do_ping_tests() { - echo "INFO: Initial validation ping." - # Each node has to be able each one. - do_ping "$ns1" 100.64.0.2 - do_ping "$ns2" 100.64.0.1 - do_ping "$ns3" 100.64.0.1 - stop_if_error "Initial validation failed." - - do_ping "$ns1" 100.64.0.3 - do_ping "$ns2" 100.64.0.3 - do_ping "$ns3" 100.64.0.2 + local netid="$1" - do_ping "$ns1" dead:beef:1::2 - do_ping "$ns1" dead:beef:1::3 - do_ping "$ns2" dead:beef:1::1 - do_ping "$ns2" dead:beef:1::2 - do_ping "$ns3" dead:beef:1::1 - do_ping "$ns3" dead:beef:1::2 + echo "INFO: Running ping tests." - stop_if_error "Initial validation failed." + echo "INFO: Initial validation ping." + # Each node has to be able to reach each one. + do_ping "$ns1" "100.64.$netid.2" + do_ping "$ns1" "100.64.$netid.3" + do_ping "$ns2" "100.64.$netid.1" + do_ping "$ns2" "100.64.$netid.3" + do_ping "$ns3" "100.64.$netid.1" + do_ping "$ns3" "100.64.$netid.2" + stop_if_error "Initial validation failed on IPv4." + + do_ping "$ns1" "dead:beef:$netid::2" + do_ping "$ns1" "dead:beef:$netid::3" + do_ping "$ns2" "dead:beef:$netid::1" + do_ping "$ns2" "dead:beef:$netid::2" + do_ping "$ns3" "dead:beef:$netid::1" + do_ping "$ns3" "dead:beef:$netid::2" + stop_if_error "Initial validation failed on IPv6." # Wait until supervisor all supervision frames have been processed and the node # entries have been merged. Otherwise duplicate frames will be observed which is # valid at this stage. + echo "INFO: Wait for node table entries to be merged." WAIT=5 while [ ${WAIT} -gt 0 ] do @@ -68,62 +71,30 @@ do_complete_ping_test() sleep 1 echo "INFO: Longer ping test." - do_ping_long "$ns1" 100.64.0.2 - do_ping_long "$ns1" dead:beef:1::2 - do_ping_long "$ns1" 100.64.0.3 - do_ping_long "$ns1" dead:beef:1::3 - - stop_if_error "Longer ping test failed." - - do_ping_long "$ns2" 100.64.0.1 - do_ping_long "$ns2" dead:beef:1::1 - do_ping_long "$ns2" 100.64.0.3 - do_ping_long "$ns2" dead:beef:1::2 - stop_if_error "Longer ping test failed." - - do_ping_long "$ns3" 100.64.0.1 - do_ping_long "$ns3" dead:beef:1::1 - do_ping_long "$ns3" 100.64.0.2 - do_ping_long "$ns3" dead:beef:1::2 - stop_if_error "Longer ping test failed." - - echo "INFO: Cutting one link." - do_ping_long "$ns1" 100.64.0.3 & - - sleep 3 - ip -net "$ns3" link set ns3eth1 down - wait - - ip -net "$ns3" link set ns3eth1 up - - stop_if_error "Failed with one link down." - - echo "INFO: Delay the link and drop a few packages." - tc -net "$ns3" qdisc add dev ns3eth1 root netem delay 50ms - tc -net "$ns2" qdisc add dev ns2eth1 root netem delay 5ms loss 25% - - do_ping_long "$ns1" 100.64.0.2 - do_ping_long "$ns1" 100.64.0.3 - - stop_if_error "Failed with delay and packetloss." - - do_ping_long "$ns2" 100.64.0.1 - do_ping_long "$ns2" 100.64.0.3 - - stop_if_error "Failed with delay and packetloss." - - do_ping_long "$ns3" 100.64.0.1 - do_ping_long "$ns3" 100.64.0.2 - stop_if_error "Failed with delay and packetloss." - - echo "INFO: All good." + do_ping_long "$ns1" "100.64.$netid.2" + do_ping_long "$ns1" "dead:beef:$netid::2" + do_ping_long "$ns1" "100.64.$netid.3" + do_ping_long "$ns1" "dead:beef:$netid::3" + stop_if_error "Longer ping test failed (ns1)." + + do_ping_long "$ns2" "100.64.$netid.1" + do_ping_long "$ns2" "dead:beef:$netid::1" + do_ping_long "$ns2" "100.64.$netid.3" + do_ping_long "$ns2" "dead:beef:$netid::3" + stop_if_error "Longer ping test failed (ns2)." + + do_ping_long "$ns3" "100.64.$netid.1" + do_ping_long "$ns3" "dead:beef:$netid::1" + do_ping_long "$ns3" "100.64.$netid.2" + do_ping_long "$ns3" "dead:beef:$netid::2" + stop_if_error "Longer ping test failed (ns3)." } setup_hsr_interfaces() { local HSRv="$1" - echo "INFO: preparing interfaces for HSRv${HSRv}." + echo "INFO: Preparing interfaces for HSRv${HSRv}." # Three HSR nodes. Each node has one link to each of its neighbour, two links in total. # # ns1eth1 ----- ns2eth1 @@ -140,17 +111,20 @@ setup_hsr_interfaces() ip link add ns3eth2 netns "$ns3" type veth peer name ns2eth2 netns "$ns2" # HSRv0/1 - ip -net "$ns1" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version $HSRv proto 0 - ip -net "$ns2" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 supervision 45 version $HSRv proto 0 - ip -net "$ns3" link add name hsr3 type hsr slave1 ns3eth1 slave2 ns3eth2 supervision 45 version $HSRv proto 0 + ip -net "$ns1" link add name hsr1 type hsr slave1 ns1eth1 \ + slave2 ns1eth2 supervision 45 version "$HSRv" proto 0 + ip -net "$ns2" link add name hsr2 type hsr slave1 ns2eth1 \ + slave2 ns2eth2 supervision 45 version "$HSRv" proto 0 + ip -net "$ns3" link add name hsr3 type hsr slave1 ns3eth1 \ + slave2 ns3eth2 supervision 45 version "$HSRv" proto 0 # IP for HSR ip -net "$ns1" addr add 100.64.0.1/24 dev hsr1 - ip -net "$ns1" addr add dead:beef:1::1/64 dev hsr1 nodad + ip -net "$ns1" addr add dead:beef:0::1/64 dev hsr1 nodad ip -net "$ns2" addr add 100.64.0.2/24 dev hsr2 - ip -net "$ns2" addr add dead:beef:1::2/64 dev hsr2 nodad + ip -net "$ns2" addr add dead:beef:0::2/64 dev hsr2 nodad ip -net "$ns3" addr add 100.64.0.3/24 dev hsr3 - ip -net "$ns3" addr add dead:beef:1::3/64 dev hsr3 nodad + ip -net "$ns3" addr add dead:beef:0::3/64 dev hsr3 nodad ip -net "$ns1" link set address 00:11:22:00:01:01 dev ns1eth1 ip -net "$ns1" link set address 00:11:22:00:01:02 dev ns1eth2 @@ -177,113 +151,56 @@ setup_hsr_interfaces() setup_vlan_interfaces() { ip -net "$ns1" link add link hsr1 name hsr1.2 type vlan id 2 - ip -net "$ns1" link add link hsr1 name hsr1.3 type vlan id 3 - ip -net "$ns1" link add link hsr1 name hsr1.4 type vlan id 4 - ip -net "$ns1" link add link hsr1 name hsr1.5 type vlan id 5 - ip -net "$ns2" link add link hsr2 name hsr2.2 type vlan id 2 - ip -net "$ns2" link add link hsr2 name hsr2.3 type vlan id 3 - ip -net "$ns2" link add link hsr2 name hsr2.4 type vlan id 4 - ip -net "$ns2" link add link hsr2 name hsr2.5 type vlan id 5 - ip -net "$ns3" link add link hsr3 name hsr3.2 type vlan id 2 - ip -net "$ns3" link add link hsr3 name hsr3.3 type vlan id 3 - ip -net "$ns3" link add link hsr3 name hsr3.4 type vlan id 4 - ip -net "$ns3" link add link hsr3 name hsr3.5 type vlan id 5 ip -net "$ns1" addr add 100.64.2.1/24 dev hsr1.2 - ip -net "$ns1" addr add 100.64.3.1/24 dev hsr1.3 - ip -net "$ns1" addr add 100.64.4.1/24 dev hsr1.4 - ip -net "$ns1" addr add 100.64.5.1/24 dev hsr1.5 + ip -net "$ns1" addr add dead:beef:2::1/64 dev hsr1.2 nodad ip -net "$ns2" addr add 100.64.2.2/24 dev hsr2.2 - ip -net "$ns2" addr add 100.64.3.2/24 dev hsr2.3 - ip -net "$ns2" addr add 100.64.4.2/24 dev hsr2.4 - ip -net "$ns2" addr add 100.64.5.2/24 dev hsr2.5 + ip -net "$ns2" addr add dead:beef:2::2/64 dev hsr2.2 nodad ip -net "$ns3" addr add 100.64.2.3/24 dev hsr3.2 - ip -net "$ns3" addr add 100.64.3.3/24 dev hsr3.3 - ip -net "$ns3" addr add 100.64.4.3/24 dev hsr3.4 - ip -net "$ns3" addr add 100.64.5.3/24 dev hsr3.5 + ip -net "$ns3" addr add dead:beef:2::3/64 dev hsr3.2 nodad ip -net "$ns1" link set dev hsr1.2 up - ip -net "$ns1" link set dev hsr1.3 up - ip -net "$ns1" link set dev hsr1.4 up - ip -net "$ns1" link set dev hsr1.5 up - ip -net "$ns2" link set dev hsr2.2 up - ip -net "$ns2" link set dev hsr2.3 up - ip -net "$ns2" link set dev hsr2.4 up - ip -net "$ns2" link set dev hsr2.5 up - ip -net "$ns3" link set dev hsr3.2 up - ip -net "$ns3" link set dev hsr3.3 up - ip -net "$ns3" link set dev hsr3.4 up - ip -net "$ns3" link set dev hsr3.5 up } -hsr_vlan_ping() { - do_ping "$ns1" 100.64.2.2 - do_ping "$ns1" 100.64.3.2 - do_ping "$ns1" 100.64.4.2 - do_ping "$ns1" 100.64.5.2 - - do_ping "$ns1" 100.64.2.3 - do_ping "$ns1" 100.64.3.3 - do_ping "$ns1" 100.64.4.3 - do_ping "$ns1" 100.64.5.3 - - do_ping "$ns2" 100.64.2.1 - do_ping "$ns2" 100.64.3.1 - do_ping "$ns2" 100.64.4.1 - do_ping "$ns2" 100.64.5.1 - - do_ping "$ns2" 100.64.2.3 - do_ping "$ns2" 100.64.3.3 - do_ping "$ns2" 100.64.4.3 - do_ping "$ns2" 100.64.5.3 - - do_ping "$ns3" 100.64.2.1 - do_ping "$ns3" 100.64.3.1 - do_ping "$ns3" 100.64.4.1 - do_ping "$ns3" 100.64.5.1 - - do_ping "$ns3" 100.64.2.2 - do_ping "$ns3" 100.64.3.2 - do_ping "$ns3" 100.64.4.2 - do_ping "$ns3" 100.64.5.2 +run_ping_tests() +{ + echo "INFO: Running ping tests." + do_ping_tests 0 } -run_vlan_tests() { +run_vlan_tests() +{ vlan_challenged_hsr1=$(ip net exec "$ns1" ethtool -k hsr1 | grep "vlan-challenged" | awk '{print $2}') vlan_challenged_hsr2=$(ip net exec "$ns2" ethtool -k hsr2 | grep "vlan-challenged" | awk '{print $2}') vlan_challenged_hsr3=$(ip net exec "$ns3" ethtool -k hsr3 | grep "vlan-challenged" | awk '{print $2}') if [[ "$vlan_challenged_hsr1" = "off" || "$vlan_challenged_hsr2" = "off" || "$vlan_challenged_hsr3" = "off" ]]; then - echo "INFO: Running VLAN tests" + echo "INFO: Running VLAN ping tests" setup_vlan_interfaces - hsr_vlan_ping + do_ping_tests 2 else echo "INFO: Not Running VLAN tests as the device does not support VLAN" fi } check_prerequisites -setup_ns ns1 ns2 ns3 - trap cleanup_all_ns EXIT +setup_ns ns1 ns2 ns3 setup_hsr_interfaces 0 -do_complete_ping_test - +run_ping_tests run_vlan_tests setup_ns ns1 ns2 ns3 - setup_hsr_interfaces 1 -do_complete_ping_test - +run_ping_tests run_vlan_tests exit $ret diff --git a/tools/testing/selftests/net/hsr/link_faults.sh b/tools/testing/selftests/net/hsr/link_faults.sh new file mode 100755 index 000000000000..be526281571c --- /dev/null +++ b/tools/testing/selftests/net/hsr/link_faults.sh @@ -0,0 +1,378 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# shellcheck disable=SC2329 + +source ../lib.sh + +ALL_TESTS=" + test_clean_hsrv0 + test_cut_link_hsrv0 + test_packet_loss_hsrv0 + test_high_packet_loss_hsrv0 + test_reordering_hsrv0 + + test_clean_hsrv1 + test_cut_link_hsrv1 + test_packet_loss_hsrv1 + test_high_packet_loss_hsrv1 + test_reordering_hsrv1 + + test_clean_prp + test_cut_link_prp + test_packet_loss_prp + test_high_packet_loss_prp + test_reordering_prp +" + +# The tests are running ping for 5sec with a relatively short interval in +# different scenarios with faulty links (cut links, packet loss, delay, +# reordering) that should be recoverable by HSR/PRP. The ping interval (10ms) +# is short enough that the base delay (50ms) leads to a queue in the netem +# qdiscs which is needed for reordering. + +setup_hsr_topo() +{ + # Three HSR nodes in a ring, every node has a LAN A interface connected + # to the LAN B interface of the next node. + # + # node1 node2 + # + # vethA -------- vethB + # hsr1 hsr2 + # vethB vethA + # \ / + # vethA vethB + # hsr3 + # + # node3 + + local ver="$1" + + setup_ns node1 node2 node3 + + # veth links + # shellcheck disable=SC2154 # variables assigned by setup_ns + ip link add vethA netns "$node1" type veth peer name vethB netns "$node2" + # shellcheck disable=SC2154 # variables assigned by setup_ns + ip link add vethA netns "$node2" type veth peer name vethB netns "$node3" + ip link add vethA netns "$node3" type veth peer name vethB netns "$node1" + + # MAC addresses (not needed for HSR operation, but helps with debugging) + ip -net "$node1" link set address 00:11:22:00:01:01 dev vethA + ip -net "$node1" link set address 00:11:22:00:01:02 dev vethB + + ip -net "$node2" link set address 00:11:22:00:02:01 dev vethA + ip -net "$node2" link set address 00:11:22:00:02:02 dev vethB + + ip -net "$node3" link set address 00:11:22:00:03:01 dev vethA + ip -net "$node3" link set address 00:11:22:00:03:02 dev vethB + + # HSR interfaces + ip -net "$node1" link add name hsr1 type hsr proto 0 version "$ver" \ + slave1 vethA slave2 vethB supervision 45 + ip -net "$node2" link add name hsr2 type hsr proto 0 version "$ver" \ + slave1 vethA slave2 vethB supervision 45 + ip -net "$node3" link add name hsr3 type hsr proto 0 version "$ver" \ + slave1 vethA slave2 vethB supervision 45 + + # IP addresses + ip -net "$node1" addr add 100.64.0.1/24 dev hsr1 + ip -net "$node2" addr add 100.64.0.2/24 dev hsr2 + ip -net "$node3" addr add 100.64.0.3/24 dev hsr3 + + # Set all links up + ip -net "$node1" link set vethA up + ip -net "$node1" link set vethB up + ip -net "$node1" link set hsr1 up + + ip -net "$node2" link set vethA up + ip -net "$node2" link set vethB up + ip -net "$node2" link set hsr2 up + + ip -net "$node3" link set vethA up + ip -net "$node3" link set vethB up + ip -net "$node3" link set hsr3 up +} + +setup_prp_topo() +{ + # Two PRP nodes, connected by two links (treated as LAN A and LAN B). + # + # vethA ----- vethA + # prp1 prp2 + # vethB ----- vethB + # + # node1 node2 + + setup_ns node1 node2 + + # veth links + ip link add vethA netns "$node1" type veth peer name vethA netns "$node2" + ip link add vethB netns "$node1" type veth peer name vethB netns "$node2" + + # MAC addresses will be copied from LAN A interface + ip -net "$node1" link set address 00:11:22:00:00:01 dev vethA + ip -net "$node2" link set address 00:11:22:00:00:02 dev vethA + + # PRP interfaces + ip -net "$node1" link add name prp1 type hsr \ + slave1 vethA slave2 vethB supervision 45 proto 1 + ip -net "$node2" link add name prp2 type hsr \ + slave1 vethA slave2 vethB supervision 45 proto 1 + + # IP addresses + ip -net "$node1" addr add 100.64.0.1/24 dev prp1 + ip -net "$node2" addr add 100.64.0.2/24 dev prp2 + + # All links up + ip -net "$node1" link set vethA up + ip -net "$node1" link set vethB up + ip -net "$node1" link set prp1 up + + ip -net "$node2" link set vethA up + ip -net "$node2" link set vethB up + ip -net "$node2" link set prp2 up +} + +wait_for_hsr_node_table() +{ + log_info "Wait for node table entries to be merged." + WAIT=5 + while [ "${WAIT}" -gt 0 ]; do + nts=$(cat /sys/kernel/debug/hsr/hsr*/node_table) + + # We need entries in the node tables, and they need to be merged + if (echo "$nts" | grep -qE "^([0-9a-f]{2}:){5}") && \ + ! (echo "$nts" | grep -q "00:00:00:00:00:00"); then + return + fi + + sleep 1 + ((WAIT--)) + done + check_err 1 "Failed to wait for merged node table entries" +} + +setup_topo() +{ + local proto="$1" + + if [ "$proto" = "HSRv0" ]; then + setup_hsr_topo 0 + wait_for_hsr_node_table + elif [ "$proto" = "HSRv1" ]; then + setup_hsr_topo 1 + wait_for_hsr_node_table + elif [ "$proto" = "PRP" ]; then + setup_prp_topo + else + check_err 1 "Unknown protocol (${proto})" + fi +} + +check_ping() +{ + local node="$1" + local dst="$2" + local accepted_dups="$3" + local ping_args="-q -i 0.01 -c 400" + + log_info "Running ping $node -> $dst" + # shellcheck disable=SC2086 + output=$(ip netns exec "$node" ping $ping_args "$dst" | \ + grep "packets transmitted") + log_info "$output" + + dups=0 + loss=0 + + if [[ "$output" =~ \+([0-9]+)" duplicates" ]]; then + dups="${BASH_REMATCH[1]}" + fi + if [[ "$output" =~ ([0-9\.]+\%)" packet loss" ]]; then + loss="${BASH_REMATCH[1]}" + fi + + if [ "$dups" -gt "$accepted_dups" ]; then + check_err 1 "Unexpected duplicate packets (${dups})" + fi + if [ "$loss" != "0%" ]; then + check_err 1 "Unexpected packet loss (${loss})" + fi +} + +test_clean() +{ + local proto="$1" + + RET=0 + tname="${FUNCNAME[0]} - ${proto}" + + setup_topo "$proto" + if ((RET != ksft_pass)); then + log_test "${tname} setup" + return + fi + + check_ping "$node1" "100.64.0.2" 0 + + log_test "${tname}" +} + +test_clean_hsrv0() +{ + test_clean "HSRv0" +} + +test_clean_hsrv1() +{ + test_clean "HSRv1" +} + +test_clean_prp() +{ + test_clean "PRP" +} + +test_cut_link() +{ + local proto="$1" + + RET=0 + tname="${FUNCNAME[0]} - ${proto}" + + setup_topo "$proto" + if ((RET != ksft_pass)); then + log_test "${tname} setup" + return + fi + + # Cutting link from subshell, so check_ping can run in the normal shell + # with access to global variables from the test harness. + ( + sleep 2 + log_info "Cutting link" + ip -net "$node1" link set vethB down + ) & + check_ping "$node1" "100.64.0.2" 0 + + wait + log_test "${tname}" +} + + +test_cut_link_hsrv0() +{ + test_cut_link "HSRv0" +} + +test_cut_link_hsrv1() +{ + test_cut_link "HSRv1" +} + +test_cut_link_prp() +{ + test_cut_link "PRP" +} + +test_packet_loss() +{ + local proto="$1" + local loss="$2" + + RET=0 + tname="${FUNCNAME[0]} - ${proto}, ${loss}" + + setup_topo "$proto" + if ((RET != ksft_pass)); then + log_test "${tname} setup" + return + fi + + # Packet loss with lower delay makes sure the packets on the lossy link + # arrive first. + tc -net "$node1" qdisc add dev vethA root netem delay 50ms + tc -net "$node1" qdisc add dev vethB root netem delay 20ms loss "$loss" + + check_ping "$node1" "100.64.0.2" 40 + + log_test "${tname}" +} + +test_packet_loss_hsrv0() +{ + test_packet_loss "HSRv0" "20%" +} + +test_packet_loss_hsrv1() +{ + test_packet_loss "HSRv1" "20%" +} + +test_packet_loss_prp() +{ + test_packet_loss "PRP" "20%" +} + +test_high_packet_loss_hsrv0() +{ + test_packet_loss "HSRv0" "80%" +} + +test_high_packet_loss_hsrv1() +{ + test_packet_loss "HSRv1" "80%" +} + +test_high_packet_loss_prp() +{ + test_packet_loss "PRP" "80%" +} + +test_reordering() +{ + local proto="$1" + + RET=0 + tname="${FUNCNAME[0]} - ${proto}" + + setup_topo "$proto" + if ((RET != ksft_pass)); then + log_test "${tname} setup" + return + fi + + tc -net "$node1" qdisc add dev vethA root netem delay 50ms + tc -net "$node1" qdisc add dev vethB root netem delay 50ms reorder 20% + + check_ping "$node1" "100.64.0.2" 40 + + log_test "${tname}" +} + +test_reordering_hsrv0() +{ + test_reordering "HSRv0" +} + +test_reordering_hsrv1() +{ + test_reordering "HSRv1" +} + +test_reordering_prp() +{ + test_reordering "PRP" +} + +cleanup() +{ + cleanup_all_ns +} + +trap cleanup EXIT + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/hsr/prp_ping.sh b/tools/testing/selftests/net/hsr/prp_ping.sh new file mode 100755 index 000000000000..fd2ba9f05d4c --- /dev/null +++ b/tools/testing/selftests/net/hsr/prp_ping.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ipv6=true + +source ./hsr_common.sh + +optstring="h4" +usage() { + echo "Usage: $0 [OPTION]" + echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)" +} + +while getopts "$optstring" option;do + case "$option" in + "h") + usage "$0" + exit 0 + ;; + "4") + ipv6=false + ;; + "?") + usage "$0" + exit 1 + ;; +esac +done + +setup_prp_interfaces() +{ + echo "INFO: Preparing interfaces for PRP" +# Two PRP nodes, connected by two links (treated as LAN A and LAN B). +# +# vethA ----- vethA +# prp1 prp2 +# vethB ----- vethB +# +# node1 node2 + + # Interfaces + # shellcheck disable=SC2154 # variables assigned by setup_ns + ip link add vethA netns "$node1" type veth peer name vethA netns "$node2" + ip link add vethB netns "$node1" type veth peer name vethB netns "$node2" + + # MAC addresses will be copied from LAN A interface + ip -net "$node1" link set address 00:11:22:00:00:01 dev vethA + ip -net "$node2" link set address 00:11:22:00:00:02 dev vethA + + # PRP + ip -net "$node1" link add name prp1 type hsr \ + slave1 vethA slave2 vethB supervision 45 proto 1 + ip -net "$node2" link add name prp2 type hsr \ + slave1 vethA slave2 vethB supervision 45 proto 1 + + # IP addresses + ip -net "$node1" addr add 100.64.0.1/24 dev prp1 + ip -net "$node1" addr add dead:beef:0::1/64 dev prp1 nodad + ip -net "$node2" addr add 100.64.0.2/24 dev prp2 + ip -net "$node2" addr add dead:beef:0::2/64 dev prp2 nodad + + # All links up + ip -net "$node1" link set vethA up + ip -net "$node1" link set vethB up + ip -net "$node1" link set prp1 up + + ip -net "$node2" link set vethA up + ip -net "$node2" link set vethB up + ip -net "$node2" link set prp2 up +} + +setup_vlan_interfaces() +{ + # Interfaces + ip -net "$node1" link add link prp1 name prp1.2 type vlan id 2 + ip -net "$node2" link add link prp2 name prp2.2 type vlan id 2 + + # IP addresses + ip -net "$node1" addr add 100.64.2.1/24 dev prp1.2 + ip -net "$node1" addr add dead:beef:2::1/64 dev prp1.2 nodad + + ip -net "$node2" addr add 100.64.2.2/24 dev prp2.2 + ip -net "$node2" addr add dead:beef:2::2/64 dev prp2.2 nodad + + # All links up + ip -net "$node1" link set prp1.2 up + ip -net "$node2" link set prp2.2 up +} + +do_ping_tests() +{ + local netid="$1" + + echo "INFO: Initial validation ping" + + do_ping "$node1" "100.64.$netid.2" + do_ping "$node2" "100.64.$netid.1" + stop_if_error "Initial validation failed on IPv4" + + do_ping "$node1" "dead:beef:$netid::2" + do_ping "$node2" "dead:beef:$netid::1" + stop_if_error "Initial validation failed on IPv6" + + echo "INFO: Longer ping test." + + do_ping_long "$node1" "100.64.$netid.2" + do_ping_long "$node2" "100.64.$netid.1" + stop_if_error "Longer ping test failed on IPv4." + + do_ping_long "$node1" "dead:beef:$netid::2" + do_ping_long "$node2" "dead:beef:$netid::1" + stop_if_error "Longer ping test failed on IPv6." +} + +run_ping_tests() +{ + echo "INFO: Running ping tests" + do_ping_tests 0 +} + +run_vlan_ping_tests() +{ + vlan_challenged_prp1=$(ip net exec "$node1" ethtool -k prp1 | \ + grep "vlan-challenged" | awk '{print $2}') + vlan_challenged_prp2=$(ip net exec "$node2" ethtool -k prp2 | \ + grep "vlan-challenged" | awk '{print $2}') + + if [[ "$vlan_challenged_prp1" = "off" || \ + "$vlan_challenged_prp2" = "off" ]]; then + echo "INFO: Running VLAN ping tests" + setup_vlan_interfaces + do_ping_tests 2 + else + echo "INFO: Not Running VLAN tests as the device does not support VLAN" + fi +} + +check_prerequisites +trap cleanup_all_ns EXIT + +setup_ns node1 node2 +setup_prp_interfaces + +run_ping_tests +run_vlan_ping_tests + +exit $ret diff --git a/tools/testing/selftests/net/hsr/settings b/tools/testing/selftests/net/hsr/settings index 0fbc037f2aa8..a953c96aa16e 100644 --- a/tools/testing/selftests/net/hsr/settings +++ b/tools/testing/selftests/net/hsr/settings @@ -1 +1 @@ -timeout=50 +timeout=180 diff --git a/tools/testing/selftests/net/icmp_rfc4884.c b/tools/testing/selftests/net/icmp_rfc4884.c new file mode 100644 index 000000000000..cd826b913557 --- /dev/null +++ b/tools/testing/selftests/net/icmp_rfc4884.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <arpa/inet.h> +#include <error.h> +#include <linux/errqueue.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <netinet/in.h> +#include <netinet/udp.h> +#include <poll.h> +#include <sched.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/socket.h> + +#include "../kselftest_harness.h" + +static const unsigned short src_port = 44444; +static const unsigned short dst_port = 55555; +static const int min_orig_dgram_len = 128; +static const int min_payload_len_v4 = + min_orig_dgram_len - sizeof(struct iphdr) - sizeof(struct udphdr); +static const int min_payload_len_v6 = + min_orig_dgram_len - sizeof(struct ipv6hdr) - sizeof(struct udphdr); +static const uint8_t orig_payload_byte = 0xAA; + +struct sockaddr_inet { + union { + struct sockaddr_in6 v6; + struct sockaddr_in v4; + struct sockaddr sa; + }; + socklen_t len; +}; + +struct ip_case_info { + int domain; + int level; + int opt1; + int opt2; + int proto; + int (*build_func)(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len); + int min_payload; +}; + +static int bringup_loopback(void) +{ + struct ifreq ifr = { + .ifr_name = "lo" + }; + int fd; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -1; + + if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) + goto err; + + ifr.ifr_flags = ifr.ifr_flags | IFF_UP; + + if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) + goto err; + + close(fd); + return 0; + +err: + close(fd); + return -1; +} + +static uint16_t csum(const void *buf, size_t len) +{ + const uint8_t *data = buf; + uint32_t sum = 0; + + while (len > 1) { + sum += (data[0] << 8) | data[1]; + data += 2; + len -= 2; + } + + if (len == 1) + sum += data[0] << 8; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum & 0xFFFF; +} + +static int poll_err(int fd) +{ + struct pollfd pfd; + + memset(&pfd, 0, sizeof(pfd)); + pfd.fd = fd; + + if (poll(&pfd, 1, 5000) != 1 || pfd.revents != POLLERR) + return -1; + + return 0; +} + +static void set_addr(struct sockaddr_inet *addr, int domain, + unsigned short port) +{ + memset(addr, 0, sizeof(*addr)); + + switch (domain) { + case AF_INET: + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = htons(port); + addr->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr->len = sizeof(addr->v4); + break; + case AF_INET6: + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = htons(port); + addr->v6.sin6_addr = in6addr_loopback; + addr->len = sizeof(addr->v6); + break; + } +} + +static int bind_and_setsockopt(int fd, const struct ip_case_info *info) +{ + struct sockaddr_inet addr; + int opt = 1; + + set_addr(&addr, info->domain, src_port); + + if (setsockopt(fd, info->level, info->opt1, &opt, sizeof(opt)) < 0) + return -1; + + if (setsockopt(fd, info->level, info->opt2, &opt, sizeof(opt)) < 0) + return -1; + + return bind(fd, &addr.sa, addr.len); +} + +static int build_rfc4884_ext(uint8_t *buf, size_t buflen, bool bad_csum, + bool bad_len, bool smaller_len) +{ + struct icmp_extobj_hdr *objh; + struct icmp_ext_hdr *exthdr; + size_t obj_len, ext_len; + uint16_t sum; + + /* Use an object payload of 4 bytes */ + obj_len = sizeof(*objh) + sizeof(uint32_t); + ext_len = sizeof(*exthdr) + obj_len; + + if (ext_len > buflen) + return -EINVAL; + + exthdr = (struct icmp_ext_hdr *)buf; + objh = (struct icmp_extobj_hdr *)(buf + sizeof(*exthdr)); + + exthdr->version = 2; + /* When encoding a bad object length, either encode a length too small + * to fit the object header or too big to fit in the packet. + */ + if (bad_len) + obj_len = smaller_len ? sizeof(*objh) - 1 : obj_len * 2; + objh->length = htons(obj_len); + + sum = csum(buf, ext_len); + exthdr->checksum = htons(bad_csum ? sum - 1 : sum); + + return ext_len; +} + +static int build_orig_dgram_v4(uint8_t *buf, ssize_t buflen, int payload_len) +{ + struct udphdr *udph; + struct iphdr *iph; + size_t len = 0; + + len = sizeof(*iph) + sizeof(*udph) + payload_len; + if (len > buflen) + return -EINVAL; + + iph = (struct iphdr *)buf; + udph = (struct udphdr *)(buf + sizeof(*iph)); + + iph->version = 4; + iph->ihl = 5; + iph->protocol = IPPROTO_UDP; + iph->saddr = htonl(INADDR_LOOPBACK); + iph->daddr = htonl(INADDR_LOOPBACK); + iph->tot_len = htons(len); + iph->check = htons(csum(iph, sizeof(*iph))); + + udph->source = htons(src_port); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + payload_len); + + memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte, + payload_len); + + return len; +} + +static int build_orig_dgram_v6(uint8_t *buf, ssize_t buflen, int payload_len) +{ + struct udphdr *udph; + struct ipv6hdr *iph; + size_t len = 0; + + len = sizeof(*iph) + sizeof(*udph) + payload_len; + if (len > buflen) + return -EINVAL; + + iph = (struct ipv6hdr *)buf; + udph = (struct udphdr *)(buf + sizeof(*iph)); + + iph->version = 6; + iph->payload_len = htons(sizeof(*udph) + payload_len); + iph->nexthdr = IPPROTO_UDP; + iph->saddr = in6addr_loopback; + iph->daddr = in6addr_loopback; + + udph->source = htons(src_port); + udph->dest = htons(dst_port); + udph->len = htons(sizeof(*udph) + payload_len); + + memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte, + payload_len); + + return len; +} + +static int build_icmpv4_pkt(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len) +{ + struct icmphdr *icmph; + int len, ret; + + len = sizeof(*icmph); + memset(buf, 0, buflen); + + icmph = (struct icmphdr *)buf; + icmph->type = ICMP_DEST_UNREACH; + icmph->code = ICMP_PORT_UNREACH; + icmph->checksum = 0; + + ret = build_orig_dgram_v4(buf + len, buflen - len, payload_len); + if (ret < 0) + return ret; + + len += ret; + + icmph->un.reserved[1] = (len - sizeof(*icmph)) / sizeof(uint32_t); + + if (with_ext) { + ret = build_rfc4884_ext(buf + len, buflen - len, + bad_csum, bad_len, smaller_len); + if (ret < 0) + return ret; + + len += ret; + } + + icmph->checksum = htons(csum(icmph, len)); + return len; +} + +static int build_icmpv6_pkt(uint8_t *buf, ssize_t buflen, bool with_ext, + int payload_len, bool bad_csum, bool bad_len, + bool smaller_len) +{ + struct icmp6hdr *icmph; + int len, ret; + + len = sizeof(*icmph); + memset(buf, 0, buflen); + + icmph = (struct icmp6hdr *)buf; + icmph->icmp6_type = ICMPV6_DEST_UNREACH; + icmph->icmp6_code = ICMPV6_PORT_UNREACH; + icmph->icmp6_cksum = 0; + + ret = build_orig_dgram_v6(buf + len, buflen - len, payload_len); + if (ret < 0) + return ret; + + len += ret; + + icmph->icmp6_datagram_len = (len - sizeof(*icmph)) / sizeof(uint64_t); + + if (with_ext) { + ret = build_rfc4884_ext(buf + len, buflen - len, + bad_csum, bad_len, smaller_len); + if (ret < 0) + return ret; + + len += ret; + } + + icmph->icmp6_cksum = htons(csum(icmph, len)); + return len; +} + +FIXTURE(rfc4884) {}; + +FIXTURE_SETUP(rfc4884) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(ret, 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + } + + ret = bringup_loopback(); + ASSERT_EQ(ret, 0) TH_LOG("Failed to bring up loopback interface"); +} + +FIXTURE_TEARDOWN(rfc4884) +{ +} + +const struct ip_case_info ipv4_info = { + .domain = AF_INET, + .level = SOL_IP, + .opt1 = IP_RECVERR, + .opt2 = IP_RECVERR_RFC4884, + .proto = IPPROTO_ICMP, + .build_func = build_icmpv4_pkt, + .min_payload = min_payload_len_v4, +}; + +const struct ip_case_info ipv6_info = { + .domain = AF_INET6, + .level = SOL_IPV6, + .opt1 = IPV6_RECVERR, + .opt2 = IPV6_RECVERR_RFC4884, + .proto = IPPROTO_ICMPV6, + .build_func = build_icmpv6_pkt, + .min_payload = min_payload_len_v6, +}; + +FIXTURE_VARIANT(rfc4884) { + /* IPv4/v6 related information */ + struct ip_case_info info; + /* Whether to append an ICMP extension or not */ + bool with_ext; + /* UDP payload length */ + int payload_len; + /* Whether to generate a bad checksum in the ICMP extension structure */ + bool bad_csum; + /* Whether to generate a bad length in the ICMP object header */ + bool bad_len; + /* Whether it is too small to fit the object header or too big to fit + * in the packet + */ + bool smaller_len; +}; + +/* Tests that a valid ICMPv4 error message with extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_small_payload) { + .info = ipv4_info, + .with_ext = true, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message with extension and 128 bytes original + * datagram, generates an error with the expected offset, and does not raise the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message with extension and the original + * datagram is larger than 128 bytes, generates an error with the expected + * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_large_payload) { + .info = ipv4_info, + .with_ext = true, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_small_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and 128 bytes + * original datagram, generates an error with zero offset, and does not raise + * the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_min_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv4 error message without extension and the original + * datagram is larger than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_large_payload) { + .info = ipv4_info, + .with_ext = false, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that an ICMPv4 error message with extension and an invalid checksum, + * generates an error with the expected offset, and raises the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_checksum) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = true, + .bad_len = false, +}; + +/* Tests that an ICMPv4 error message with extension and an object length + * smaller than the object header, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_small) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = true, + .smaller_len = true, +}; + +/* Tests that an ICMPv4 error message with extension and an object length that + * is too big to fit in the packet, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_large) { + .info = ipv4_info, + .with_ext = true, + .payload_len = min_payload_len_v4, + .bad_csum = false, + .bad_len = true, + .smaller_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_small_payload) { + .info = ipv6_info, + .with_ext = true, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and 128 bytes original + * datagram, generates an error with the expected offset, and does not raise the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message with extension and the original + * datagram is larger than 128 bytes, generates an error with the expected + * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_large_payload) { + .info = ipv6_info, + .with_ext = true, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; +/* Tests that a valid ICMPv6 error message without extension and the original + * datagram is smaller than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_small_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = 64, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message without extension and 128 bytes + * original datagram, generates an error with zero offset, and does not + * raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_min_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that a valid ICMPv6 error message without extension and the original + * datagram is larger than 128 bytes, generates an error with zero offset, + * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_large_payload) { + .info = ipv6_info, + .with_ext = false, + .payload_len = 256, + .bad_csum = false, + .bad_len = false, +}; + +/* Tests that an ICMPv6 error message with extension and an invalid checksum, + * generates an error with the expected offset, and raises the + * SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_checksum) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = true, + .bad_len = false, +}; + +/* Tests that an ICMPv6 error message with extension and an object length + * smaller than the object header, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_small) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = true, + .smaller_len = true, +}; + +/* Tests that an ICMPv6 error message with extension and an object length that + * is too big to fit in the packet, generates an error with the expected offset, + * and raises the SO_EE_RFC4884_FLAG_INVALID flag. + */ +FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_large) { + .info = ipv6_info, + .with_ext = true, + .payload_len = min_payload_len_v6, + .bad_csum = false, + .bad_len = true, + .smaller_len = false, +}; + +static void +check_rfc4884_offset(struct __test_metadata *_metadata, int sock, + const FIXTURE_VARIANT(rfc4884) *v) +{ + char rxbuf[1024]; + char ctrl[1024]; + struct iovec iov = { + .iov_base = rxbuf, + .iov_len = sizeof(rxbuf) + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = ctrl, + .msg_controllen = sizeof(ctrl), + }; + struct cmsghdr *cmsg; + int recv; + + ASSERT_EQ(poll_err(sock), 0); + + recv = recvmsg(sock, &msg, MSG_ERRQUEUE); + ASSERT_GE(recv, 0) TH_LOG("recvmsg(MSG_ERRQUEUE) failed"); + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + bool is_invalid, expected_invalid; + struct sock_extended_err *ee; + int expected_off; + uint16_t off; + + if (cmsg->cmsg_level != v->info.level || + cmsg->cmsg_type != v->info.opt1) { + TH_LOG("Unrelated cmsgs were encountered in recvmsg()"); + continue; + } + + ee = (struct sock_extended_err *)CMSG_DATA(cmsg); + off = ee->ee_rfc4884.len; + is_invalid = ee->ee_rfc4884.flags & SO_EE_RFC4884_FLAG_INVALID; + + expected_invalid = v->bad_csum || v->bad_len; + ASSERT_EQ(is_invalid, expected_invalid) { + TH_LOG("Expected invalidity flag to be %d, but got %d", + expected_invalid, is_invalid); + } + + expected_off = + (v->with_ext && v->payload_len >= v->info.min_payload) ? + v->payload_len : 0; + ASSERT_EQ(off, expected_off) { + TH_LOG("Expected RFC4884 offset %u, got %u", + expected_off, off); + } + break; + } +} + +TEST_F(rfc4884, rfc4884) +{ + const typeof(variant) v = variant; + struct sockaddr_inet addr; + uint8_t pkt[1024]; + int dgram, raw; + int len, sent; + int err; + + dgram = socket(v->info.domain, SOCK_DGRAM, 0); + ASSERT_GE(dgram, 0) TH_LOG("Opening datagram socket failed"); + + err = bind_and_setsockopt(dgram, &v->info); + ASSERT_EQ(err, 0) TH_LOG("Bind failed"); + + raw = socket(v->info.domain, SOCK_RAW, v->info.proto); + ASSERT_GE(raw, 0) TH_LOG("Opening raw socket failed"); + + len = v->info.build_func(pkt, sizeof(pkt), v->with_ext, v->payload_len, + v->bad_csum, v->bad_len, v->smaller_len); + ASSERT_GT(len, 0) TH_LOG("Building packet failed"); + + set_addr(&addr, v->info.domain, 0); + sent = sendto(raw, pkt, len, 0, &addr.sa, addr.len); + ASSERT_EQ(len, sent) TH_LOG("Sending packet failed"); + + check_rfc4884_offset(_metadata, dgram, v); + + close(dgram); + close(raw); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index 845c26dd01a9..b2b99889942f 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -273,8 +273,8 @@ setup() ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null - ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null + ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha link set veth0 up &>/dev/null ip -netns $ioam_node_alpha link set lo up &>/dev/null ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \ diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index 0ccf484b1d9d..f4afef51b930 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -43,6 +43,10 @@ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#ifndef offsetof +#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + #define IPV4_STR_SZ 16 /* xxx.xxx.xxx.xxx is longest + \0 */ #define MAX_PAYLOAD 2048 #define XFRM_ALGO_KEY_BUF_SIZE 512 @@ -827,13 +831,16 @@ static int xfrm_fill_key(char *name, char *buf, static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz, struct xfrm_desc *desc) { - struct { + union { union { struct xfrm_algo alg; struct xfrm_algo_aead aead; struct xfrm_algo_auth auth; } u; - char buf[XFRM_ALGO_KEY_BUF_SIZE]; + struct { + unsigned char __offset_to_FAM[offsetof(struct xfrm_algo_auth, alg_key)]; + char buf[XFRM_ALGO_KEY_BUF_SIZE]; + }; } alg = {}; size_t alen, elen, clen, aelen; unsigned short type; diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c index 27437590eeb5..e28884ce3ab3 100644 --- a/tools/testing/selftests/net/lib/csum.c +++ b/tools/testing/selftests/net/lib/csum.c @@ -707,7 +707,7 @@ static uint32_t recv_get_packet_csum_status(struct msghdr *msg) cm->cmsg_level, cm->cmsg_type); if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata))) - error(1, 0, "cmsg: len=%lu expected=%lu", + error(1, 0, "cmsg: len=%zu expected=%zu", cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata))); aux = (void *)CMSG_DATA(cm); diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 531e7fa1b3ea..6cdfb8afccb5 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -8,7 +8,7 @@ import time import traceback from collections import namedtuple from .consts import KSFT_MAIN_NAME -from .utils import global_defer_queue +from . import utils KSFT_RESULT = None KSFT_RESULT_ALL = True @@ -32,8 +32,23 @@ class KsftTerminate(KeyboardInterrupt): def ksft_pr(*objs, **kwargs): + """ + Print logs to stdout. + + Behaves like print() but log lines will be prefixed + with # to prevent breaking the TAP output formatting. + + Extra arguments (on top of what print() supports): + line_pfx - add extra string before each line + """ + sep = kwargs.pop("sep", " ") + pfx = kwargs.pop("line_pfx", "") + pfx = "#" + (" " + pfx if pfx else "") kwargs["flush"] = True - print("#", *objs, **kwargs) + + text = sep.join(str(obj) for obj in objs) + prefixed = f"\n{pfx} ".join(text.split('\n')) + print(pfx, prefixed, **kwargs) def _fail(*args): @@ -153,21 +168,24 @@ def ktap_result(ok, cnt=1, case_name="", comment=""): print(res, flush=True) +def _ksft_defer_arm(state): + """ Allow or disallow the use of defer() """ + utils.GLOBAL_DEFER_ARMED = state + + def ksft_flush_defer(): global KSFT_RESULT i = 0 - qlen_start = len(global_defer_queue) - while global_defer_queue: + qlen_start = len(utils.GLOBAL_DEFER_QUEUE) + while utils.GLOBAL_DEFER_QUEUE: i += 1 - entry = global_defer_queue.pop() + entry = utils.GLOBAL_DEFER_QUEUE.pop() try: entry.exec_only() except Exception: ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!") - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Defer Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Defer Exception|") KSFT_RESULT = False @@ -315,6 +333,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): comment = "" cnt_key = "" + _ksft_defer_arm(True) try: func(*args) except KsftSkipEx as e: @@ -325,20 +344,17 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cnt_key = 'xfail' except BaseException as e: stop |= isinstance(e, KeyboardInterrupt) - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Exception|") if stop: ksft_pr(f"Stopping tests due to {type(e).__name__}.") KSFT_RESULT = False cnt_key = 'fail' + _ksft_defer_arm(False) try: ksft_flush_defer() except BaseException as e: - tb = traceback.format_exc() - for line in tb.strip().split('\n'): - ksft_pr("Exception|", line) + ksft_pr(traceback.format_exc(), line_pfx="Exception|") if isinstance(e, KeyboardInterrupt): ksft_pr() ksft_pr("WARN: defer() interrupted, cleanup may be incomplete.") diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 106ee1f2df86..85884f3e827b 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -41,7 +41,9 @@ class cmd: self.ret = None self.ksft_term_fd = None + self.host = host self.comm = comm + if host: self.proc = host.cmd(comm) else: @@ -99,6 +101,27 @@ class cmd: raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" % (self.proc.args, stdout, stderr), self) + def __repr__(self): + def str_fmt(name, s): + name += ': ' + return (name + s.strip().replace('\n', '\n' + ' ' * len(name))) + + ret = "CMD" + if self.host: + ret += "[remote]" + if self.ret is None: + ret += f" (unterminated): {self.comm}\n" + elif self.ret == 0: + ret += f" (success): {self.comm}\n" + else: + ret += f": {self.comm}\n" + ret += f" EXIT: {self.ret}\n" + if self.stdout: + ret += str_fmt(" STDOUT", self.stdout) + "\n" + if self.stderr: + ret += str_fmt(" STDERR", self.stderr) + "\n" + return ret.strip() + class bkg(cmd): """ @@ -137,11 +160,12 @@ class bkg(cmd): def __exit__(self, ex_type, ex_value, ex_tb): # Force termination on exception - terminate = self.terminate or (self._exit_wait and ex_type) + terminate = self.terminate or (self._exit_wait and ex_type is not None) return self.process(terminate=terminate, fail=self.check_fail) -global_defer_queue = [] +GLOBAL_DEFER_QUEUE = [] +GLOBAL_DEFER_ARMED = False class defer: @@ -153,7 +177,9 @@ class defer: self.args = args self.kwargs = kwargs - self._queue = global_defer_queue + if not GLOBAL_DEFER_ARMED: + raise Exception("defer queue not armed, did you use defer() outside of a test case?") + self._queue = GLOBAL_DEFER_QUEUE self._queue.append(self) def __enter__(self): diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index 4dd6278cd3dd..22ba0da2adb8 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -11,6 +11,7 @@ TEST_PROGS := \ mptcp_connect_checksum.sh \ mptcp_connect_mmap.sh \ mptcp_connect_sendfile.sh \ + mptcp_connect_splice.sh \ mptcp_join.sh \ mptcp_sockopt.sh \ pm_netlink.sh \ diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 10f6f99cfd4e..cbe573c4ab3a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -52,6 +52,7 @@ enum cfg_mode { CFG_MODE_POLL, CFG_MODE_MMAP, CFG_MODE_SENDFILE, + CFG_MODE_SPLICE, }; enum cfg_peek { @@ -124,7 +125,7 @@ static void die_usage(void) fprintf(stderr, "\t-j -- add additional sleep at connection start and tear down " "-- for MPJ tests\n"); fprintf(stderr, "\t-l -- listens mode, accepts incoming connection\n"); - fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); + fprintf(stderr, "\t-m [poll|mmap|sendfile|splice] -- use poll(default)/mmap+write/sendfile/splice\n"); fprintf(stderr, "\t-M mark -- set socket packet mark\n"); fprintf(stderr, "\t-o option -- test sockopt <option>\n"); fprintf(stderr, "\t-p num -- use port num\n"); @@ -258,7 +259,7 @@ static void set_transparent(int fd, int pf) } } -static void set_mptfo(int fd, int pf) +static void set_mptfo(int fd) { int qlen = 25; @@ -335,7 +336,7 @@ static int sock_listen_mptcp(const char * const listenaddr, set_transparent(sock, pf); if (cfg_sockopt_types.mptfo) - set_mptfo(sock, pf); + set_mptfo(sock); if (bind(sock, a->ai_addr, a->ai_addrlen) == 0) break; /* success */ @@ -406,21 +407,18 @@ static int sock_connect_mptcp(const char * const remoteaddr, *peer = a; break; /* success */ } + perror("sendto()"); } else { if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) { *peer = a; break; /* success */ } - } - if (cfg_sockopt_types.mptfo) { - perror("sendto()"); - close(sock); - sock = -1; - } else { perror("connect()"); - close(sock); - sock = -1; } + + /* error */ + close(sock); + sock = -1; } freeaddrinfo(addr); @@ -935,6 +933,71 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd, return err; } +static int do_splice(const int infd, const int outfd, const size_t len, + struct wstate *winfo) +{ + ssize_t in_bytes, out_bytes; + int pipefd[2]; + int err; + + err = pipe(pipefd); + if (err) { + perror("pipe"); + return 2; + } + +again: + in_bytes = splice(infd, NULL, pipefd[1], NULL, len - winfo->total_len, + SPLICE_F_MOVE | SPLICE_F_MORE); + if (in_bytes < 0) { + perror("splice in"); + err = 3; + } else if (in_bytes > 0) { + out_bytes = splice(pipefd[0], NULL, outfd, NULL, in_bytes, + SPLICE_F_MOVE | SPLICE_F_MORE); + if (out_bytes < 0) { + perror("splice out"); + err = 4; + } else if (in_bytes != out_bytes) { + fprintf(stderr, "Unexpected transfer: %zu vs %zu\n", + in_bytes, out_bytes); + err = 5; + } else { + goto again; + } + } + + close(pipefd[0]); + close(pipefd[1]); + + return err; +} + +static int copyfd_io_splice(int infd, int peerfd, int outfd, unsigned int size, + bool *in_closed_after_out, struct wstate *winfo) +{ + int err; + + if (listen_mode) { + err = do_splice(peerfd, outfd, size, winfo); + if (err) + return err; + + err = do_splice(infd, peerfd, size, winfo); + } else { + err = do_splice(infd, peerfd, size, winfo); + if (err) + return err; + + shut_wr(peerfd); + + err = do_splice(peerfd, outfd, size, winfo); + *in_closed_after_out = true; + } + + return err; +} + static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct wstate *winfo) { bool in_closed_after_out = false; @@ -967,6 +1030,14 @@ static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct &in_closed_after_out, winfo); break; + case CFG_MODE_SPLICE: + file_size = get_infd_size(infd); + if (file_size < 0) + return file_size; + ret = copyfd_io_splice(infd, peerfd, outfd, file_size, + &in_closed_after_out, winfo); + break; + default: fprintf(stderr, "Invalid mode %d\n", cfg_mode); @@ -1296,8 +1367,8 @@ void xdisconnect(int fd) int main_loop(void) { + struct addrinfo *peer = NULL; int fd = 0, ret, fd_in = 0; - struct addrinfo *peer; struct wstate winfo; if (cfg_input && cfg_sockopt_types.mptfo) { @@ -1380,12 +1451,15 @@ int parse_mode(const char *mode) return CFG_MODE_MMAP; if (!strcasecmp(mode, "sendfile")) return CFG_MODE_SENDFILE; + if (!strcasecmp(mode, "splice")) + return CFG_MODE_SPLICE; fprintf(stderr, "Unknown test mode: %s\n", mode); fprintf(stderr, "Supported modes are:\n"); fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n"); fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n"); fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n"); + fprintf(stderr, "\t\t\"splice\" - send entire input file (splice), then read response (-l will read input first)\n"); die_usage(); diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh b/tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh new file mode 100755 index 000000000000..241254a966c9 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +MPTCP_LIB_KSFT_TEST="$(basename "${0}" .sh)" \ + "$(dirname "${0}")/mptcp_connect.sh" -m splice "${@}" diff --git a/tools/testing/selftests/net/mptcp/mptcp_diag.c b/tools/testing/selftests/net/mptcp/mptcp_diag.c index 8e0b1b8d84b6..5e222ba977e4 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_diag.c +++ b/tools/testing/selftests/net/mptcp/mptcp_diag.c @@ -1,21 +1,24 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2025, Kylin Software */ -#include <linux/sock_diag.h> -#include <linux/rtnetlink.h> -#include <linux/inet_diag.h> -#include <linux/netlink.h> -#include <linux/compiler.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + #include <sys/socket.h> -#include <netinet/in.h> -#include <linux/tcp.h> + #include <arpa/inet.h> -#include <unistd.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> -#include <stdio.h> +#include <netinet/in.h> + +#include <linux/compiler.h> +#include <linux/inet_diag.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/sock_diag.h> +#include <linux/tcp.h> #ifndef IPPROTO_MPTCP #define IPPROTO_MPTCP 262 diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e70d3420954f..dc1f200aaa81 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -603,8 +603,7 @@ wait_rm_addr() local old_cnt="${2}" local cnt - local i - for i in $(seq 10); do + for _ in $(seq 10); do cnt=$(rm_addr_count ${ns}) [ "$cnt" = "${old_cnt}" ] || break sleep 0.1 @@ -623,25 +622,22 @@ wait_rm_sf() local old_cnt="${2}" local cnt - local i - for i in $(seq 10); do + for _ in $(seq 10); do cnt=$(rm_sf_count ${ns}) [ "$cnt" = "${old_cnt}" ] || break sleep 0.1 done } +# $1: expected MPJ ACK Rx counter in $ns1 wait_mpj() { - local ns="${1}" - local cnt old_cnt - - old_cnt=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPJoinAckRx") + local exp_cnt="${1}" + local cnt - local i - for i in $(seq 10); do - cnt=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPJoinAckRx") - [ "$cnt" = "${old_cnt}" ] || break + for _ in $(seq 10); do + cnt=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx") + [ "${cnt}" = "${exp_cnt}" ] && break sleep 0.1 done } @@ -650,8 +646,7 @@ wait_ll_ready() { local ns="${1}" - local i - for i in $(seq 50); do + for _ in $(seq 50); do ip -n "${ns}" -6 addr show scope link | grep "inet6 fe80" | grep -qw "tentative" || break sleep 0.1 @@ -1407,7 +1402,7 @@ chk_join_tx_nr() count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxCreatSkErr") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$create" ]; then rc=${KSFT_FAIL} print_check "syn tx create socket error" @@ -1416,7 +1411,7 @@ chk_join_tx_nr() count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxBindErr") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$bind" ]; then rc=${KSFT_FAIL} print_check "syn tx bind error" @@ -1425,7 +1420,7 @@ chk_join_tx_nr() count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxConnectErr") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$connect" ]; then rc=${KSFT_FAIL} print_check "syn tx connect error" @@ -1451,7 +1446,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtInfiniteMapTx") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$infinite_map_tx" ]; then rc=${KSFT_FAIL} print_check "$ns infinite map tx fallback" @@ -1460,7 +1455,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtDSSCorruptionFallback") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$dss_corruption" ]; then rc=${KSFT_FAIL} print_check "$ns dss corruption fallback" @@ -1469,7 +1464,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtSimultConnectFallback") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$simult_conn" ]; then rc=${KSFT_FAIL} print_check "$ns simult conn fallback" @@ -1478,7 +1473,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableFallbackACK") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$mpc_passive" ]; then rc=${KSFT_FAIL} print_check "$ns mpc passive fallback" @@ -1487,7 +1482,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableFallbackSYNACK") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$mpc_active" ]; then rc=${KSFT_FAIL} print_check "$ns mpc active fallback" @@ -1496,7 +1491,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableDataFallback") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$mpc_data" ]; then rc=${KSFT_FAIL} print_check "$ns mpc data fallback" @@ -1505,7 +1500,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMD5SigFallback") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$md5_sig" ]; then rc=${KSFT_FAIL} print_check "$ns MD5 Sig fallback" @@ -1514,7 +1509,7 @@ chk_fallback_nr() count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtDssFallback") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$dss" ]; then rc=${KSFT_FAIL} print_check "$ns dss fallback" @@ -1590,7 +1585,7 @@ chk_join_nr() count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckHMacFailure") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "0" ]; then rc=${KSFT_FAIL} print_check "synack HMAC" @@ -1599,7 +1594,7 @@ chk_join_nr() count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$ack_nr" ]; then rc=${KSFT_FAIL} print_check "ack rx" @@ -1608,7 +1603,7 @@ chk_join_nr() count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckHMacFailure") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "0" ]; then rc=${KSFT_FAIL} print_check "ack HMAC" @@ -1617,7 +1612,7 @@ chk_join_nr() count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinRejected") if [ -z "$count" ]; then - rc=${KSFT_SKIP} + : # ignore skip elif [ "$count" != "$syn_rej" ]; then rc=${KSFT_FAIL} print_check "syn rejected" @@ -1650,7 +1645,6 @@ chk_stale_nr() local stale_min=$2 local stale_max=$3 local stale_delta=$4 - local dump_stats local stale_nr local recover_nr @@ -1666,16 +1660,11 @@ chk_stale_nr() fail_test "got $stale_nr stale[s] $recover_nr recover[s], " \ " expected stale in range [$stale_min..$stale_max]," \ " stale-recover delta $stale_delta" - dump_stats=1 + echo $ns stats + ip -n $ns -s link show else print_ok fi - - if [ "${dump_stats}" = 1 ]; then - echo $ns stats - ip netns exec $ns ip -s link show - ip netns exec $ns nstat -as | grep MPTcp - fi } chk_add_nr() @@ -3718,7 +3707,6 @@ userspace_pm_add_addr() tk=$(mptcp_lib_evts_get_info token "$evts") ip netns exec $1 ./pm_nl_ctl ann $2 token $tk id $3 - sleep 1 } # $1: ns ; $2: id @@ -3749,7 +3737,6 @@ userspace_pm_add_sf() ip netns exec $1 ./pm_nl_ctl csf lip $2 lid $3 \ rip $da rport $dp token $tk - sleep 1 } # $1: ns ; $2: addr $3: event type @@ -3999,9 +3986,11 @@ userspace_tests() { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns1 + wait_event ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 userspace_pm_add_addr $ns1 10.0.2.1 10 + wait_event ns2 MPTCP_LIB_EVENT_ANNOUNCED 1 userspace_pm_add_addr $ns1 10.0.3.1 20 + wait_event ns2 MPTCP_LIB_EVENT_ANNOUNCED 2 chk_join_nr 2 2 2 chk_add_nr 2 2 chk_mptcp_info subflows 2 subflows 2 @@ -4032,8 +4021,9 @@ userspace_tests() { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns2 + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 userspace_pm_add_sf $ns2 10.0.3.2 20 + wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1 chk_join_nr 1 1 1 chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 @@ -4060,10 +4050,11 @@ userspace_tests() { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns2 + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 userspace_pm_add_sf $ns2 10.0.3.2 0 + wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1 userspace_pm_chk_dump_addr "${ns2}" \ "id 0 flags subflow 10.0.3.2" "id 0 subflow" chk_join_nr 1 1 1 @@ -4081,8 +4072,9 @@ userspace_tests() { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns2 + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 userspace_pm_add_sf $ns2 10.0.3.2 20 + wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1 chk_join_nr 1 1 1 chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 @@ -4105,8 +4097,9 @@ userspace_tests() { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns1 + wait_event ns1 MPTCP_LIB_EVENT_ESTABLISHED 1 userspace_pm_add_addr $ns1 10.0.2.1 10 + wait_event ns2 MPTCP_LIB_EVENT_ANNOUNCED 1 chk_join_nr 1 1 1 chk_add_nr 1 1 chk_mptcp_info subflows 1 subflows 1 @@ -4133,6 +4126,7 @@ userspace_tests() local tests_pid=$! wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 userspace_pm_add_sf $ns2 10.0.3.2 20 + wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1 chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 @@ -4158,7 +4152,7 @@ endpoint_tests() { # subflow_rebuild_header is needed to support the implicit flag # userspace pm type prevents add_addr - if reset "implicit EP" && + if reset_with_events "implicit EP" && continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 @@ -4167,7 +4161,7 @@ endpoint_tests() run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns1 + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 pm_nl_check_endpoint "creation" \ $ns2 10.0.2.2 id 1 flags implicit chk_mptcp_info subflows 1 subflows 1 @@ -4181,6 +4175,7 @@ endpoint_tests() pm_nl_check_endpoint "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal mptcp_lib_kill_group_wait $tests_pid + kill_events_pids fi if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && @@ -4194,7 +4189,7 @@ endpoint_tests() run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns2 + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 pm_nl_check_endpoint "creation" \ $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 chk_subflow_nr "before delete id 2" 2 @@ -4206,7 +4201,7 @@ endpoint_tests() chk_mptcp_info subflows 0 subflows 0 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - wait_mpj $ns2 + wait_mpj 2 chk_subflow_nr "after re-add id 2" 2 chk_mptcp_info subflows 1 subflows 1 @@ -4218,7 +4213,7 @@ endpoint_tests() ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT pm_nl_del_endpoint $ns2 3 10.0.3.2 pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - wait_mpj $ns2 + wait_mpj 3 chk_subflow_nr "after no reject" 3 chk_mptcp_info subflows 2 subflows 2 @@ -4230,7 +4225,7 @@ endpoint_tests() chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow - wait_mpj $ns2 + wait_mpj $((3 + i)) chk_subflow_nr "after re-add id 0 ($i)" 3 chk_mptcp_info subflows 3 subflows 3 done @@ -4272,7 +4267,7 @@ endpoint_tests() run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! - wait_mpj $ns2 + wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1 pm_nl_check_endpoint "creation" \ $ns1 10.0.2.1 id 1 flags signal chk_subflow_nr "before delete" 2 @@ -4288,7 +4283,7 @@ endpoint_tests() pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal - wait_mpj $ns2 + wait_mpj 3 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 @@ -4300,7 +4295,7 @@ endpoint_tests() chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal - wait_mpj $ns2 + wait_mpj 4 chk_subflow_nr "after re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 @@ -4312,7 +4307,7 @@ endpoint_tests() chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal - wait_mpj $ns2 + wait_mpj 5 chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 @@ -4361,9 +4356,9 @@ endpoint_tests() wait_rm_addr $ns2 0 ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - wait_mpj $ns2 + wait_mpj 1 pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal - wait_mpj $ns2 + wait_mpj 2 mptcp_lib_kill_group_wait $tests_pid join_syn_tx=3 join_connect_err=1 \ diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 12ce61fa15a8..979cff56e1f5 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -29,6 +29,7 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_SCTP=m CONFIG_IPV6=y CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_TUNNEL=m CONFIG_IP_VS=m CONFIG_IP_VS_PROTO_TCP=y CONFIG_IP_VS_RR=m diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a68bc882fa4e..7a34ef468975 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -592,16 +592,33 @@ ip -net "$nsr1" link set tun0 up ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null +ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 +ip -net "$nsr1" link set tun6 up +ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad + ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 ip -net "$nsr2" link set tun0 up ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null +ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 || ret=1 +ip -net "$nsr2" link set tun6 up +ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad + ip -net "$nsr1" route change default via 192.168.100.2 ip -net "$nsr2" route change default via 192.168.100.1 + +# do not use "route change" and delete old default so +# socat fails to connect in case new default can't be added. +ip -6 -net "$nsr1" route delete default +ip -6 -net "$nsr1" route add default via fee1:3::2 +ip -6 -net "$nsr2" route delete default +ip -6 -net "$nsr2" route add default via fee1:3::1 ip -net "$ns2" route add default via 10.0.2.1 +ip -6 -net "$ns2" route add default via dead:2::1 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6 accept' ip netns exec "$nsr1" nft -a insert rule inet filter forward \ 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' @@ -611,28 +628,53 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel" +else + echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # Create vlan tagged devices for IPIP traffic. ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 ip -net "$nsr1" link set veth1.10 up ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 +ip -net "$nsr1" addr add fee1:4::1/64 dev veth1.10 nodad ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' -ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 -ip -net "$nsr1" link set tun1 up -ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 + +ip -net "$nsr1" link add name tun0.10 type ipip local 192.168.20.1 remote 192.168.20.2 +ip -net "$nsr1" link set tun0.10 up +ip -net "$nsr1" addr add 192.168.200.1/24 dev tun0.10 ip -net "$nsr1" route change default via 192.168.200.2 -ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null -ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept' + +ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 +ip -net "$nsr1" link set tun6.10 up +ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad +ip -6 -net "$nsr1" route delete default +ip -6 -net "$nsr1" route add default via fee1:5::2 +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept' ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 ip -net "$nsr2" link set veth0.10 up ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 +ip -net "$nsr2" addr add fee1:4::2/64 dev veth0.10 nodad ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null -ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 -ip -net "$nsr2" link set tun1 up -ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 + +ip -net "$nsr2" link add name tun0.10 type ipip local 192.168.20.2 remote 192.168.20.1 +ip -net "$nsr2" link set tun0.10 up +ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10 ip -net "$nsr2" route change default via 192.168.200.1 -ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1 +ip -net "$nsr2" link set tun6.10 up +ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad +ip -6 -net "$nsr2" route delete default +ip -6 -net "$nsr2" route add default via fee1:5::1 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 @@ -640,10 +682,19 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel over vlan" +else + echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel over vlan" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # Restore the previous configuration ip -net "$nsr1" route change default via 192.168.10.2 ip -net "$nsr2" route change default via 192.168.10.1 ip -net "$ns2" route del default via 10.0.2.1 +ip -6 -net "$ns2" route del default via dead:2::1 } # Another test: diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index 6136ceec45e0..139bc1211878 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -510,7 +510,7 @@ EOF udp_listener_ready() { - ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345 + ss -S -N "$1" -uln -o "sport = :$2" | grep -q "$2" } output_files_written() @@ -518,7 +518,7 @@ output_files_written() test -s "$1" && test -s "$2" } -test_udp_ct_race() +test_udp_nat_race() { ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF flush ruleset @@ -545,8 +545,8 @@ EOF ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 & local nfqpid=$! - busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" - busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12345 + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 12345 busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12 # Send two packets, one should end up in ns1, other in ns2. @@ -557,7 +557,7 @@ EOF busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2" - kill "$nfqpid" + kill "$nfqpid" "$rpid1" "$rpid2" if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then echo "FAIL: Expected One udp conntrack entry" @@ -585,6 +585,135 @@ EOF echo "PASS: both udp receivers got one packet each" } +# Make sure UDPGRO aggregated packets don't lose +# their skb->nfct entry when nfqueue passes the +# skb to userspace with software gso segmentation on. +test_udp_gro_ct() +{ + local errprefix="FAIL: test_udp_gro_ct:" + + ip netns exec "$nsrouter" conntrack -F 2>/dev/null + + ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +flush ruleset +table inet udpq { + # Number of packets/bytes queued to userspace + counter toqueue { } + # Number of packets/bytes reinjected from userspace with 'ct new' intact + counter fromqueue { } + # These two counters should be identical and not 0. + + chain prerouting { + type filter hook prerouting priority -300; policy accept; + + # userspace sends small packets, if < 1000, UDPGRO did + # not kick in, but test needs a 'new' conntrack with udpgro skb. + meta iifname veth0 meta l4proto udp meta length > 1000 accept + + # don't pick up non-gso packets and don't queue them to + # userspace. + notrack + } + + chain postrouting { + type filter hook postrouting priority 0; policy accept; + + # Only queue unconfirmed fraglist gro skbs to userspace. + udp dport 12346 ct status ! confirmed counter name "toqueue" mark set 1 queue num 1 + } + + chain validate { + type filter hook postrouting priority 1; policy accept; + # ... and only count those that were reinjected with the + # skb->nfct intact. + mark 1 counter name "fromqueue" + } +} +EOF + timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc & + local rpid=$! + + ip netns exec "$nsrouter" ./nf_queue -G -c -q 1 -t 2 > "$TMPFILE2" & + local nfqpid=$! + + ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding on rx-gro-list on generic-receive-offload on + + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12346 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1 + + local bs=512 + local count=$(((32 * 1024 * 1024) / bs)) + dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 16); do + timeout 5 ip netns exec "$ns1" \ + socat -u -b 512 STDIN UDP-DATAGRAM:10.0.2.99:12346,reuseport,bind=0.0.0.0:55221 & + done + + busywait 10000 test -s "$TMPFILE1" + + kill "$rpid" + + wait + + local p + local b + local pqueued + local bqueued + + c=$(ip netns exec "$nsrouter" nft list counter inet udpq "toqueue" | grep packets) + read p pqueued b bqueued <<EOF +$c +EOF + local preinject + local breinject + c=$(ip netns exec "$nsrouter" nft list counter inet udpq "fromqueue" | grep packets) + read p preinject b breinject <<EOF +$c +EOF + ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding off + ip netns exec "$nsrouter" ethtool -K "veth1" rx-udp-gro-forwarding off + + if [ "$pqueued" -eq 0 ];then + # happens when gro did not build at least on aggregate + echo "SKIP: No packets were queued" + return + fi + + local saw_ct_entry=0 + if ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12346 2>/dev/null | wc -l | grep -q "^1"'; then + saw_ct_entry=1 + else + echo "$errprefix Expected udp conntrack entry" + ip netns exec "$nsrouter" conntrack -L + ret=1 + fi + + if [ "$pqueued" -ge "$preinject" ] ;then + echo "$errprefix Expected software segmentation to occur, had $pqueued and $preinject" + ret=1 + return + fi + + # sw segmentation adds extra udp and ip headers. + local breinject_expect=$((preinject * (512 + 20 + 8))) + + if [ "$breinject" -eq "$breinject_expect" ]; then + if [ "$saw_ct_entry" -eq 1 ];then + echo "PASS: fraglist gro skb passed with conntrack entry" + else + echo "$errprefix fraglist gro skb passed without conntrack entry" + ret=1 + fi + else + echo "$errprefix Counter mismatch, conntrack entry dropped by nfqueue? Queued: $pqueued, $bqueued. Post-queue: $preinject, $breinject. Expected $breinject_expect" + ret=1 + fi + + if ! ip netns exec "$nsrouter" nft delete table inet udpq; then + echo "$errprefix: Could not delete udpq table" + ret=1 + fi +} + test_queue_removal() { read tainted_then < /proc/sys/kernel/tainted @@ -663,7 +792,8 @@ test_tcp_localhost_connectclose test_tcp_localhost_requeue test_sctp_forward test_sctp_output -test_udp_ct_race +test_udp_nat_race +test_udp_gro_ct # should be last, adds vrf device in ns1 and changes routes test_icmp_vrf diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt new file mode 100644 index 000000000000..07e9936e70e6 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt @@ -0,0 +1,24 @@ +// 3rd ACK + 1st data segment lost, data segments with ce + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + ++0.05 < SEWA 0:0(0) win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> +// 3rd ACK lost +// 1st data segment lost ++0.05 < [ce] EAP. 1001:2001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1 ceb 1000 e0b 1,nop,nop,nop,sack 1001:2001> ++.002 accept(3, ..., ...) = 4 + ++0.2 < [ce] EAP. 1:1001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.001 > [ect0] EWA. 1:1(0) ack 2001 <ECN e1b 1 ceb 2000 e0b 1,nop> + ++0.05 < [ce] EAP. 2001:3001(1000) ack 1 win 264 ++.001 > [ect0] . 1:1(0) ack 3001 <ECN e1b 1 ceb 3000 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt new file mode 100644 index 000000000000..76b8422b34dc --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt @@ -0,0 +1,30 @@ +// 3rd ACK + 1st data segment lost, 2nd data segments with ce + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1016,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> +// 3rd ACK lost ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 write(4, ..., 2000) = 2000 +// 1st data segment lost + 2nd gets CE ++.002 > [ect0] .5 1:1005(1004) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++.000 > [ect0] P.5 1005:2001(996) ack 1 <ECN e1b 1 ceb 0 e0b 1, nop> ++0.05 < [ect0] .6 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 996 e1b 1,nop,nop,nop,sack 1005:2001> + ++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }% + ++0.002~+0.1 > [ect0] .5 1:1005(1004) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++.05 < [ect0] .6 1:1(0) ack 2001 win 264 <ECN e0b 1005 ceb 996 e1b 1,nop> + ++0.01 write(4, ..., 1000) = 1000 ++0~+0.002 > [ect0] P.5 2001:3001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.1 < [ect0] .5 1:1001(1000) ack 3001 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++0~+0.01 > [ect0] .5 3001:3001(0) ack 1001 <ECN e1b 1 ceb 0 e0b 1001,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt new file mode 100644 index 000000000000..84060e490589 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt @@ -0,0 +1,19 @@ +// Test 3rd ACK flags when SYN-ACK is rexmitted + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.1 < [ect0] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// Our code currently sends a challenge ACK +// when it receives a SYN in ESTABLISHED state +// based on the latest SYN ++.002 > [ect0] A. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt new file mode 100644 index 000000000000..d3fe09d0606f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt @@ -0,0 +1,18 @@ +// Third ACK CE increases r.cep + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + ++0.05 < SEWA 0:0(0) win 32767 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ce] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] WAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt new file mode 100644 index 000000000000..d28722db42b1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt @@ -0,0 +1,22 @@ +// 3rd ACK lost, CE for the first data segment + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + ++0.05 < SEWA 0:0(0) win 32767 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> +// 3rd ACK lost ++0.05 < [ce] EAP. 1:1001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] WA. 1:1(0) ack 1001 <ECN e1b 1 ceb 1000 e0b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.05 < [ce] EAP. 1001:2001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.001 > [ect0] EWA. 1:1(0) ack 2001 <ECN e1b 1 ceb 2000 e0b 1 ,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt new file mode 100644 index 000000000000..a4d808116e34 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt @@ -0,0 +1,26 @@ +// Test SYN/ACK rexmit triggered 3rd ACK duplicate + CE on first data seg + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> + +// SYN/ACK rexmitted => two 3rd ACKs in-flight ++1.0~+1.1 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> +// Delivered 1st 3rd ACK ++0.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + +// Duplicate 3rd ACK delivered ++1.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> + ++0.05 < [ce] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] WA. 1:1(0) ack 1001 <ECN e1b 1 ceb 1000 e0b 1,nop> + +0 read(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt new file mode 100644 index 000000000000..410a303c6d49 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt @@ -0,0 +1,13 @@ +// Test that when accurate ECN is disabled, +// client uses RFC3168 ECN for SYN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=1 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEW 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,nop,nop,nop,wscale 8> ++.002 > [noecn] . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt new file mode 100644 index 000000000000..10728114b11b --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt @@ -0,0 +1,28 @@ +// Test that SYN-ACK with ACE flags and without +// ACE flags got dropped. Although we disable ECN, +// we shouldn't consider this as blackholed as +// these are dropped due to congestion + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < [ect0] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] SA. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> + +// Retransmit SYN ++0.1 < [noecn] S 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + ++0.1 < [noecn] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + +// Write with AccECN option but with ip-noecn since we received one SYN with ACE=0 ++0.01 write(4, ..., 100) = 100 ++.002 > [noecn] P5. 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt new file mode 100644 index 000000000000..04d928f0d44d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt @@ -0,0 +1,18 @@ +// Test AccECN -> RFC3168 fallback when sysctl asks for RFC3168 ECN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=1 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < . 1:1(0) ack 1 win 320 ++.002 accept(3, ..., ...) = 4 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] P. 1:1001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt new file mode 100644 index 000000000000..788af6bea69c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt @@ -0,0 +1,34 @@ +// Client negotiates AccECN and starts sending +// AccECN option in last ACK and data segments +// Middlebox drops AccECN option and client +// reverts to ACE flags only + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +sysctl -q net.ipv4.tcp_ecn_option_beacon=1 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.05 < [ect0] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] EA. 1:1(0) ack 1001 <ECN e1b 1 ceb 0 e0b 1001,nop> + +0 read(4, ..., 1000) = 1000 + ++0.05 < [ect0] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] EA. 1:1(0) ack 1001 <ECN e1b 1 ceb 0 e0b 2001,nop,nop,nop,sack 1:1001> + ++0.05 < [ect0] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] EA. 1:1(0) ack 1001 <nop,nop,sack 1:1001> + ++0.05 < [ect0] EAP. 1001:2001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] EA. 1:1(0) ack 2001 + +0 read(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt new file mode 100644 index 000000000000..f5839c2e682d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt @@ -0,0 +1,38 @@ +// Client negotiates AccECN and starts sending +// AccECN option in last ACK and data segments +// Middlebox accepts AccECN option but some packets +// are lost due to congestion. Client should +// continue to send AccECN option + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.102 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.1 < [ect0] SW. 0:0(0) ack 1 win 32767 <mss 1024,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] A. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + +// Send ++0.01 write(4, ..., 3000) = 3000 ++.002 > [ect0] .5 1:1013(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++.002 > [ect0] P.5 1013:2025(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++.002 > [ect0] P.5 2025:3001(976) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + +// First two segments were lost due to congestion as SACK was +// received acknowledging 3rd segment ++0.1 < [ect0] .5 1:1(0) ack 1 win 264 <ECN e1b 1 ceb 0 e0b 977,nop,nop,nop,sack 2025:3001> + +// Since data with option was SACKed, we can +// continue to use AccECN option for the rest of +// the connection. This one is a rexmt ++.02~+0.5 > [ect0] .5 1:1013(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++0.1 < [ect0] .5 1:1(0) ack 3001 win 264 <ECN e1b 1 ceb 0 e0b 3000,nop> + +// Send new data, it should contain AccECN option ++0.01 write(4, ..., 2000) = 2000 ++.002 > [ect0] .5 3001:4013(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++.002 > [ect0] P.5 4013:5001(988) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt new file mode 100644 index 000000000000..c00b36d6a833 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt @@ -0,0 +1,12 @@ +// AccECN sysctl server-side only, no ECN/AccECN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=5 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < S. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,nop,nop,nop,wscale 8> ++.002 > . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt new file mode 100644 index 000000000000..f9c27f39f354 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt @@ -0,0 +1,25 @@ +// Test basic connection teardown where local process closes first: +// the local process calls close() first, so we send a FIN, and receive an ACK. +// Then we receive a FIN and ACK it. + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +.01...0.011 connect(3, ..., ...) = 0 + +0 > [noecn] SEWA 0:0(0) <...> + +0 < [ect1] SW. 0:0(0) ack 1 win 32768 <mss 1000,nop,wscale 6,nop,nop,sackOK> + +0 > [ect0] EW. 1:1(0) ack 1 + + +0 write(3, ..., 1000) = 1000 + +0 > [ect0] P5. 1:1001(1000) ack 1 + +0 < [ect0] .5 1:1(0) ack 1001 win 257 + + +0 close(3) = 0 + +0 > [ect0] F5. 1001:1001(0) ack 1 + +0 < [ect0] .5 1:1(0) ack 1002 win 257 + + +0 < [ect0] F5. 1:1(0) ack 1002 win 257 + +0 > [ect0] . 1002:1002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt new file mode 100644 index 000000000000..6d771234124a --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt @@ -0,0 +1,25 @@ +// Test a large ACK (> ACE field max) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 14600) = 14600 ++.002 > [ect0] P.5 1:14601(14600) ack 1 ++0.05 < [ect0] .5 1:1(0) ack 1461 win 264 ++0.05 < [ect0] .5 1:1(0) ack 14601 win 264 + ++0.01 %{ assert tcpi_delivered_ce == 8, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt new file mode 100644 index 000000000000..76384f52b021 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt @@ -0,0 +1,31 @@ +// Test false overflow detection with option used to rule out overflow + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + +// Stop sending option to allow easier testing ++0 `sysctl -q net.ipv4.tcp_ecn_option=0` + ++0.002 write(4, ..., 14600) = 14600 ++.002 > [ect0] P.5 1:14601(14600) ack 1 + ++0.05 < [ect0] .5 1:1(0) ack 1460 win 264 <ECN e0b 1461 ceb 0 e1b 1,nop> ++0.05 < [ect0] .5 1:1(0) ack 14601 win 264 <ECN e0b 14601 ceb 0 e1b 1,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 0, tcpi_delivered_ce +assert tcpi_delivered_e0_bytes == 14600, tcpi_delivered_e0_bytes +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt new file mode 100644 index 000000000000..8bce5dce35a2 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt @@ -0,0 +1,24 @@ +// Test a large ACK (> ACE field max) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 14600) = 14600 ++.002 > [ect0] P.5 1:14601(14600) ack 1 ++0.05 < [ect0] .5 1:1(0) ack 14601 win 264 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt new file mode 100644 index 000000000000..5f2b147214f4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt @@ -0,0 +1,25 @@ +// Test a large ACK (> ACE field max) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 14600) = 14600 ++.002 > [ect0] P.5 1:14601(14600) ack 1 + // Fake CE ++0.05 < [ect0] .6 1:1(0) ack 14601 win 264 + ++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt new file mode 100644 index 000000000000..fd07bdc14f37 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt @@ -0,0 +1,25 @@ +// Test a large ACK (at ACE field max delta) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 14600) = 14600 ++.002 > [ect0] P.5 1:14601(14600) ack 1 + // Fake CE ++0.05 < [ect0] .4 1:1(0) ack 14601 win 264 + ++0.01 %{ assert tcpi_delivered_ce == 7, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt new file mode 100644 index 000000000000..cb1e70ff2d26 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt @@ -0,0 +1,70 @@ +// Test basic AccECN CEP/CEB/E0B/E1B functionality & CEP wrapping + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ +assert tcpi_delivered_ce == 0, tcpi_delivered_ce +assert tcpi_delivered_ce_bytes == 0, tcpi_delivered_ce_bytes +}% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + // Fake CE ++0.05 < [ect0] WA. 1:1(0) ack 1001 win 264 <ECN e0b 1 ceb 1000 e1b 1,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 1, tcpi_delivered_ce +assert tcpi_delivered_ce_bytes == 1000, tcpi_delivered_ce_bytes +}% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + // Fake ect0 ++0.05 < [ect0] WA. 1:1(0) ack 2001 win 264 <ECN e0b 1001 ceb 1000 e1b 1,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 1, tcpi_delivered_ce +assert tcpi_delivered_e0_bytes == 1000, tcpi_delivered_e0_bytes +}% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 2001:3001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + // Fake ce ++0.05 < [ect0] EWA. 1:1(0) ack 3001 win 264 <ECN e0b 1001 ceb 2000 e1b 1,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 2, tcpi_delivered_ce +assert tcpi_delivered_ce_bytes == 2000, tcpi_delivered_ce_bytes +}% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 3001:4001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + // Fake ect1 ++0.05 < [ect0] EWA. 1:1(0) ack 4001 win 264 <ECN e0b 1001 ceb 2000 e1b 1001,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 2, tcpi_delivered_ce +assert tcpi_delivered_e1_bytes == 1000, tcpi_delivered_e1_bytes +}% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 4001:5001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + // Fake ce ++0.05 < [ect0] . 1:1(0) ack 5001 win 264 <ECN e0b 1001 ceb 3000 e1b 1001,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 3, tcpi_delivered_ce +assert tcpi_delivered_ce_bytes == 3000, tcpi_delivered_ce_bytes +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt new file mode 100644 index 000000000000..6627c7bb2d26 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt @@ -0,0 +1,12 @@ +// Test that tcp_ecn=4 uses RFC3168 ECN for SYN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=4 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.05 connect(4, ..., ...) = 0 + ++.002 > SEW 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt new file mode 100644 index 000000000000..51879477bb50 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt @@ -0,0 +1,35 @@ +// Test basic AccECN CEP/CEB/E0B/E1B functionality & CEP wrapping + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.05 < [ce] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] WA. 1:1(0) ack 1001 <ECN e1b 1 ceb 1000 e0b 1,nop> + +0 read(4, ..., 1000) = 1000 + ++0.05 < [ect0] EAP. 1001:2001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] WA. 1:1(0) ack 2001 <ECN e1b 1 ceb 1000 e0b 1001,nop> + +0 read(4, ..., 1000) = 1000 + ++0.05 < [ce] EAP. 2001:3001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] EWA. 1:1(0) ack 3001 <ECN e1b 1 ceb 2000 e0b 1001,nop> + +0 read(4, ..., 1000) = 1000 + ++0.05 < [ect1] EAP. 3001:4001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] EWA. 1:1(0) ack 4001 <ECN e1b 1001 ceb 2000 e0b 1001,nop> + +0 read(4, ..., 1000) = 1000 + ++0.05 < [ce] EAP. 4001:5001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] . 1:1(0) ack 5001 <ECN e1b 1001 ceb 3000 e0b 1001,nop> + +0 read(4, ..., 1000) = 1000 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt new file mode 100644 index 000000000000..0c72fa4a1251 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt @@ -0,0 +1,14 @@ +// Test IP flags drop +--tolerance_usecs=50000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 1.1 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++.02 ~ +1.1 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt new file mode 100644 index 000000000000..171f9433e55f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt @@ -0,0 +1,16 @@ +// SYN/ACK option drop test + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.02 ~+2 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.02 ~+5 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.02 ~+8 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt new file mode 100644 index 000000000000..0f65cf56cd2b --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt @@ -0,0 +1,28 @@ +// Test that SYN-ACK with ACE flags and without +// ACE flags got dropped. Although we disable ECN, +// we shouldn't consider this as blackholed as +// these are dropped due to congestion + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> + +// Retransmit SYN-ACK without option ++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// SYN-ACK maybe getting blackholed, disable ECN ++2~+2.2 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++4~+4.4 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// Received an ACK after sending 3rd retransmission, not a blackhole ++0.1 < [noecn] . 1:1(0) ack 1 win 320 ++.002 accept(3, ..., ...) = 4 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt new file mode 100644 index 000000000000..343181633980 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt @@ -0,0 +1,18 @@ +// Test that SYN with ACE flags and without +// ACE flags got dropped. Although we disable +// ECN, we shouldn't consider this as blackholed +// as these are dropped due to congestion + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 3.1 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++.02~+1.1 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++.02~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++.02~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.1 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0~+0.01 > [noecn] . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt new file mode 100644 index 000000000000..37dabc4603c8 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt @@ -0,0 +1,23 @@ +// Test AccECN flags bleach + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] . 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [noecn] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++0.05 < [ect0] EAP. 1:1(0) ack 1001 win 320 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt new file mode 100644 index 000000000000..5b14892fda51 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt @@ -0,0 +1,23 @@ +// Test basic AccECN negotiation + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++.05 < [ect0] EAP. 1:1(0) ack 1001 win 256 <ECN e0b 1001 ceb 0 e1b 0,nop> + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt new file mode 100644 index 000000000000..25f7cb2feb25 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt @@ -0,0 +1,26 @@ +// Test basic AccECN negotiation + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++0.05 < [ect0] EAP. 1:1(0) ack 1001 win 320 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt new file mode 100644 index 000000000000..50e08c492a69 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt @@ -0,0 +1,23 @@ +// Test basic AccECN negotiation without option + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 ++.05 < [ect0] EAP. 1:1(0) ack 1001 win 256 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1001:2001(1000) ack 1 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt new file mode 100644 index 000000000000..2904f1ba9975 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt @@ -0,0 +1,23 @@ +// Test basic AccECN negotiation, late option enable + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 ++.05 < [ect0] EAP. 1:1(0) ack 1001 win 256 <ECN e0b 1001 ceb 0 e1b 1,nop> + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt new file mode 100644 index 000000000000..64e0fc1c1f14 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt @@ -0,0 +1,20 @@ +// Test client behavior on receiving a non ECN SYN-ACK +// after receiving an AccECN SYN-ACK and moving to +// ESTABLISHED state + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> +// Receive an AccECN SYN-ACK and move to ESTABLISHED ++0.05 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + +// Receive a non ECN SYN-ACK and send a challenge ACK with ACE feedback ++0.1 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt new file mode 100644 index 000000000000..f407c629a3f7 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt @@ -0,0 +1,27 @@ +// Test basic AccECN negotiation with option off using sysctl + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 ++0.05 < [ect0] EAP. 1:1(0) ack 1001 win 320 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1001:2001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt new file mode 100644 index 000000000000..32454e7187f9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt @@ -0,0 +1,27 @@ +// Test no progress filtering + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + // Fake CE and claim no progress ++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 1000 e1b 1,nop> + ++0.01 %{ +assert tcpi_delivered_ce == 0, tcpi_delivered_ce +assert tcpi_delivered_ce_bytes == 0, tcpi_delivered_ce_bytes +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt new file mode 100644 index 000000000000..6597d5f2d778 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt @@ -0,0 +1,28 @@ +// Test that SYN-ACK with ACE flags and without +// ACE flags got dropped. Although we disable ECN, +// we shouldn't consider this as blackholed as +// these are dropped due to congestion + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < [noecn] S 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// Retransmit SYN ++0.1 < [ect0] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + ++0.1 < [noecn] . 1:1(0) ack 1 win 320 ++.002 accept(3, ..., ...) = 4 + +// Write with AccECN option but with ip-noecn since we received one SYN with ACE=0 ++0.01 write(4, ..., 100) = 100 ++.002 > [noecn] P. 1:101(100) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt new file mode 100644 index 000000000000..0f97dfcfa82d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt @@ -0,0 +1,18 @@ +// Test RFC3168 fallback when sysctl asks for AccECN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEW 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < . 1:1(0) ack 1 win 320 ++.002 accept(3, ..., ...) = 4 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] P. 1:1001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt new file mode 100644 index 000000000000..9baffdd66fe5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt @@ -0,0 +1,18 @@ +// Test RFC3168 ECN when sysctl asks for RFC3168 ECN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=1 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEW 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.05 < . 1:1(0) ack 1 win 320 ++.002 accept(3, ..., ...) = 4 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] P. 1:1001(1000) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt new file mode 100644 index 000000000000..3fc56f9c6a6f --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt @@ -0,0 +1,28 @@ +// Test SACK space grab to fit AccECN option +--tcp_ts_tick_usecs=1000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++.01 < [ect1] EAP. 1001:2001(1000) ack 1 win 264 ++0.002 > [ect0] EA. 1:1(0) ack 1 <ECN e1b 1001 ceb 0 e0b 1,nop,nop,nop,sack 1001:2001> ++.01 < [ect0] EAP. 3001:4001(1000) ack 1 win 264 ++0.002 > [ect0] EA. 1:1(0) ack 1 <ECN e1b 1001 ceb 0 e0b 1001,nop,nop,nop,sack 3001:4001 1001:2001> ++.01 < [ce] EAP. 5001:6001(1000) ack 1 win 264 ++0.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1001 ceb 1000 e0b 1001,nop,nop,nop,sack 5001:6001 3001:4001 1001:2001> +// DSACK works? ++.01 < [ect0] EAP. 5001:6001(1000) ack 1 win 264 ++0.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1001 ceb 1000 e0b 2001,nop,nop,nop,sack 5001:6001 5001:6001 3001:4001> ++.01 < [ect1] EAP. 6001:7001(1000) ack 1 win 264 ++0.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 2001 ceb 1000 e0b 2001,nop,nop,nop,sack 5001:7001 3001:4001 1001:2001> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt new file mode 100644 index 000000000000..1c075b5d81ae --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt @@ -0,0 +1,39 @@ +// Test SACK space grab to fit AccECN option +--tcp_ts_tick_usecs=1000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,sackOK,TS val 1 ecr 0,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 1,ECN e1b 1 ceb 0 e0b 1,nop,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 100,ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + +// One SACK block should allow all 3 AccECN fields: ++.01 < [ect1] EAP. 1001:2001(1000) ack 1 win 264 <nop,nop,TS val 3 ecr 100> ++0.002 > [ect0] EA. 1:1(0) ack 1 <nop,nop,TS val 160 ecr 2,ECN e1b 1001 ceb 0 e0b 1,nop,nop,nop,sack 1001:2001> + +// Two SACK blocks should fit w/ AccECN if we only need to use 2 AccECN fields: check ect1 arriving. ++.01 < [ect1] EAP. 3001:4001(1000) ack 1 win 264 <nop,nop,TS val 4 ecr 100> ++0.002 > [ect0] EA. 1:1(0) ack 1 <nop,nop,TS val 172 ecr 2,ECN e1b 2001 ceb 0,nop,nop,sack 3001:4001 1001:2001> + +// Two SACK blocks should fit w/ AccECN if we only need to use 2 AccECN fields: check CE arriving. ++.01 < [ce] EAP. 5001:6001(1000) ack 1 win 264 <nop,nop,TS val 5 ecr 100> ++0.002 > [ect0] WA. 1:1(0) ack 1 <nop,nop,TS val 184 ecr 2,ECN e1b 2001 ceb 1000,nop,nop,sack 5001:6001 3001:4001> + +// Check that DSACK works, using 2 SACK blocks in total, if we only need to use 2 AccECN fields: check ect1 arriving. ++.01 < [ect1] EAP. 5001:6001(1000) ack 1 win 264 <nop,nop,TS val 5 ecr 100> ++0.002 > [ect0] WA. 1:1(0) ack 1 <nop,nop,TS val 196 ecr 2,ECN e1b 3001 ceb 1000,nop,nop,sack 5001:6001 5001:6001> + +// Check the case where the AccECN option doesn't fit, because sending ect0 +// with order 1 would rquire 3 AccECN fields, +// and TS (12 bytes) + 2 SACK blocks (20 bytes) + 3 AccECN fields (2 + 3*3 bytes) > 40 bytes. +// That's OK; Linux TCP AccECN is optimized for the ECT1 case, not ECT0. ++.01 < [ect0] EAP. 6001:7001(1000) ack 1 win 264 <nop,nop,TS val 5 ecr 100> ++0.002 > [ect0] WA. 1:1(0) ack 1 <nop,nop,TS val 204 ecr 2,nop,nop,sack 5001:7001 3001:4001 1001:2001> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt new file mode 100644 index 000000000000..6b88ab78bfce --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt @@ -0,0 +1,20 @@ +// Test against classic ECN server +// Not-ECT on SYN and server sets 1|0|1 (AE is unused for classic ECN) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] SEA. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700> + ++0 write(4, ..., 100) = 100 ++.002 > [ect0] P.5 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700> ++0 close(4) = 0 + ++.002 > [ect0] F.5 101:101(0) ack 1 <nop,nop,TS val 400 ecr 700> ++0.1 < [noecn] R. 1:1(0) ack 102 win 4242 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt new file mode 100644 index 000000000000..d24ada008ece --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt @@ -0,0 +1,20 @@ +// Test against classic ECN server +// Not-ECT on SYN and server sets 0|0|1 + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] SE. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8> ++.002 > [noecn] . 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700> + ++0 write(4, ..., 100) = 100 ++.002 > [ect0] P. 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700> ++0 close(4) = 0 + ++0 > [noecn] F. 101:101(0) ack 1 <...> ++0.1 < R. 1:1(0) ack 102 win 4242 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt new file mode 100644 index 000000000000..a20d7e890ee1 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt @@ -0,0 +1,19 @@ +// Test against broken server (1|1|1) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] SEWA. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8> ++.002 > [noecn] . 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700> + ++0 write(4, ..., 100) = 100 ++.002 > [noecn] P. 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700> ++0 close(4) = 0 + ++.002 > [noecn] F. 101:101(0) ack 1 <...> ++0.1 < [noecn] R. 1:1(0) ack 102 win 4242 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt new file mode 100644 index 000000000000..428255bedab7 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt @@ -0,0 +1,19 @@ +// Test against Non ECN server (0|0|0) + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8> ++.002 > [noecn] . 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700> + ++0 write(4, ..., 100) = 100 ++.002 > [noecn] P. 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700> ++0 close(4) = 0 + ++.002 > [noecn] F. 101:101(0) ack 1 <nop,nop,TS val 400 ecr 700> ++0.1 < [noecn] R. 1:1(0) ack 102 win 4242 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt new file mode 100644 index 000000000000..e9a5a0d3677c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt @@ -0,0 +1,18 @@ +// Test AccECN with sysctl set to server-side only + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=5 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt new file mode 100644 index 000000000000..412fa903105c --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt @@ -0,0 +1,18 @@ +// Test that SYN with ACE flags was Acked +// after 2nd retransmission. In this case, +// since we got SYN-ACK that supports Accurate +// ECN, we consider this as successful negotiation + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 2.1 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++1~+1.1 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++1~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> + ++0.1 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1016,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++0~+0.01 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt new file mode 100644 index 000000000000..4622754a2270 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt @@ -0,0 +1,16 @@ +// Test that SYN with ACE flags got dropped +// We retry one more time with ACE and then +// fallback to disabled ECN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 2.1 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++1~+1.1 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++1~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.1 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0~+0.01 > [noecn] . 1:1(0) ack 1 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt new file mode 100644 index 000000000000..ee15f108cafe --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt @@ -0,0 +1,27 @@ +// Test that SYN-ACK with ACE flags was Acked +// after 2nd retransmission. In this case, +// since we got the last ACK that supports Accurate +// ECN, we consider this as successful negotiation + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> + +// Retransmit SYN-ACK without option ++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// SYN-ACK maybe getting blackholed, disable ECN ++2~+2.2 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// Received an ACK with ACE flags, state should be set to negotiation succeeded ++0.1 < [noecn] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt new file mode 100644 index 000000000000..ccfe353a8ee4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt @@ -0,0 +1,26 @@ +// Test that SYN-ACK with ACE flags got dropped +// We retry one more time with ACE and then +// fallback to disabled ECN + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> + +// Retransmit SYN-ACK without option ++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// SYN-ACK maybe getting blackholed, disable ECN ++2~+2.2 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +// Received an ACK with no ACE flags, state should be set to blackholed ++0.1 < [noecn] . 1:1(0) ack 1 win 320 ++0 accept(3, ..., ...) = 4 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt new file mode 100644 index 000000000000..dc83f7a18180 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt @@ -0,0 +1,13 @@ +// Test AccECN ECN field reflector in SYNACK + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < [ce] SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SWA. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt new file mode 100644 index 000000000000..e63a8d018c37 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt @@ -0,0 +1,13 @@ +// Test AccECN ECN field reflector in SYNACK + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < [ect0] SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SA. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt new file mode 100644 index 000000000000..23c0e43b3dbe --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt @@ -0,0 +1,13 @@ +// Test AccECN ECN field reflector in SYNACK + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < [ect1] SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SEW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt new file mode 100644 index 000000000000..c3497738f680 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt @@ -0,0 +1,27 @@ +// Test SYNACK CE & received_ce update + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [ce] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 write(4, ..., 100) = 100 ++.002 > [ect0] P.6 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++0.05 < [ect0] P.5 1:101(100) ack 101 win 256 <ECN e0b 101 ceb 0 e1b 1,nop> ++.002 > [ect0] .6 101:101(0) ack 101 <ECN e1b 1 ceb 0 e0b 101,nop> + ++0.01 write(4, ..., 100) = 100 ++.002 > [ect0] P.6 101:201(100) ack 101 <ECN e1b 1 ceb 0 e0b 101,nop> + ++0.1 < [ect1] P.5 201:301(100) ack 201 win 256 <ECN e0b 101 ceb 0 e1b 1,nop> ++.002 > [ect0] .6 201:201(0) ack 101 <ECN e1b 101 ceb 0 e0b 101,nop,nop,nop,sack 201:301> + ++0.01 < [ce] .6 401:501(100) ack 201 win 256 <ECN e0b 101 ceb 0 e1b 1,nop> ++.002 > [ect0] .7 201:201(0) ack 101 <ECN e1b 101 ceb 100 e0b 101,nop,nop,nop,sack 401:501 201:301> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt new file mode 100644 index 000000000000..5fd77f466572 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt @@ -0,0 +1,22 @@ +// Reflected SYNACK CE mark increases delivered_ce + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_fallback=0 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + ++0.05 < SEWA 0:0(0) win 32767 <mss 1050,nop,nop,sackOK,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> +// Fake ce for prev, ECT validator must be disabled for this to work ++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt new file mode 100644 index 000000000000..f6ad1ea5c0c4 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt @@ -0,0 +1,24 @@ +// Test SYN=0 reflector + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [ect0] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] A. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 write(4, ..., 100) = 100 ++.002 > [ect0] P.5 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++0.05 < [ect0] P.5 1:1(0) ack 101 win 256 <ECN e0b 101 ceb 0 e1b 1,nop> + ++0.01 < [ect0] P.5 1:101(100) ack 101 win 256 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] .5 101:101(0) ack 101 <ECN e1b 1 ceb 0 e0b 101,nop> ++0 read(4, ..., 100) = 100 + ++0 close(4) = 0 ++0 > F.5 101:101(0) ack 101 <...> ++0.1 < R. 101:101(0) ack 102 win 4242 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt new file mode 100644 index 000000000000..7ecfc5fb9dbb --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt @@ -0,0 +1,24 @@ +// Test SYN=0 reflector + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < [ect1] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] EW. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.01 write(4, ..., 100) = 100 ++.002 > [ect0] P.5 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> ++0.05 < [ect1] P.5 1:1(0) ack 101 win 256 <ECN e0b 101 ceb 0 e1b 1,nop> + ++0.01 < [ect1] P.5 1:101(100) ack 101 win 256 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 > [ect0] .5 101:101(0) ack 101 <ECN e1b 101 ceb 0 e0b 1,nop> ++0 read(4, ..., 100) = 100 + ++0 close(4) = 0 ++0 > F5. 101:101(0) ack 101 <...> ++0.1 < R. 101:101(0) ack 102 win 4242 diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt new file mode 100644 index 000000000000..9e0959782ef5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt @@ -0,0 +1,15 @@ +// Test 3rd ACK flags when SYN-ACK is rexmitted + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 ++.002 ... 0.052 connect(4, ..., ...) = 0 + ++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> + ++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8> ++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt new file mode 100644 index 000000000000..a5a41633af07 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt @@ -0,0 +1,25 @@ +// Test that we retransmit SYN-ACK with ACE and without +// AccECN options after +// SYN-ACK was lost and TCP moved to TCPS_SYN_RECEIVED + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +sysctl -q net.ipv4.tcp_ecn_option=2 +` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 1) = 0 + ++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8> + +// Retransmit SYN-ACK without option ++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> ++0.1 < [noecn] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop> ++.002 accept(3, ..., ...) = 4 + +// We try to write with AccECN option ++0.01 write(4, ..., 100) = 100 ++.002 > [ect0] P5. 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop> diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt new file mode 100644 index 000000000000..f3fe2f098966 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt @@ -0,0 +1,26 @@ +// Test TS progress filtering +--tcp_ts_tick_usecs=1000 +--tolerance_usecs=7000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,sackOK,TS val 1 ecr 0,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,sackOK,TS val 10 ecr 1,ECN e1b 1 ceb 0 e0b 1,nop,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 10> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <nop,nop,TS val 83 ecr 2> + // Fake CE and claim no progress ++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 83> + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt new file mode 100644 index 000000000000..1446799d2481 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt @@ -0,0 +1,25 @@ +// Test TS progress filtering +--tcp_ts_tick_usecs=1000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_ecn=3 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < SEWA 0:0(0) win 32792 <mss 1050,sackOK,TS val 1 ecr 0,nop,wscale 8> ++.002 > SW. 0:0(0) ack 1 <mss 1460,sackOK,TS val 10 ecr 1,ECN e1b 1 ceb 0 e0b 1,nop,nop,wscale 8> ++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 10> ++.002 accept(3, ..., ...) = 4 + ++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }% + ++0.01 write(4, ..., 1000) = 1000 ++.002 > [ect0] EAP. 1:1001(1000) ack 1 <nop,nop,TS val 83 ecr 2> + // Fake CE and claim no progress ++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <nop,nop,TS val 3 ecr 83> + ++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }% diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt new file mode 100644 index 000000000000..319f81dd717d --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Minimal active open. +// First to close connection. + +`./defaults.sh` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 + + // Connect to server: active open: three-way handshake + +0...0 connect(4, ..., ...) = 0 + +0 > S 0:0(0) <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 8> + +0 < S. 0:0(0) ack 1 win 65535 <mss 1460,sackOK,nop,nop,nop,wscale 7> + +0 > . 1:1(0) ack 1 + + // Send data + +0 send(4, ..., 1000, 0) = 1000 + +0 > P. 1:1001(1000) ack 1 + +0 < . 1:1(0) ack 1001 win 257 + + +0 close(4) = 0 + +0 > F. 1001:1001(0) ack 1 + +0 < F. 1:1(0) ack 1002 win 257 + +0 > . 1002:1002(0) ack 2 diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt new file mode 100644 index 000000000000..e72a291b666e --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Minimal passive open. +// Peer is first to close. + +`./defaults.sh` + + // Open listener socket + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + // Incoming connection: passive open: three-way handshake + +0 < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 8> + +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> + +0 < . 1:1(0) ack 1 win 257 + + // Open connection socket and close listener socket + +0 accept(3, ..., ...) = 4 + +0 close(3) = 0 + + // Peer sends data: acknowledge and receive + +0 < P. 1:1001(1000) ack 1 win 257 + +0 > . 1:1(0) ack 1001 + +0 recv(4, ..., 1000, 0) = 1000 + + // Peer initiates connection close + +0 < F. 1001:1001(0) ack 1 win 257 + +.04 > . 1:1(0) ack 1002 + + // Local socket also closes its side + +0 close(4) = 0 + +0 > F. 1:1(0) ack 1002 + +0 < . 1002:1002(0) ack 2 win 257 diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt new file mode 100644 index 000000000000..95a1957a2cf9 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +// Test after "tcp: tcp_tx_timestamp() must look at the rtx queue" + +// This test is about receiving the SCM_TSTAMP_ACK, +// we do not care about its SCM_TIMESTAMPING precision. +--tolerance_usecs=1000000 + +`./defaults.sh +sysctl -q net.ipv4.tcp_min_tso_segs=70 +` + +// Create a socket and set it to non-blocking. + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 fcntl(3, F_GETFL) = 0x2 (flags O_RDWR) + +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +// Establish connection and verify that there was no error. + +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8> ++.010 < S. 0:0(0) ack 1 win 65535 <mss 1000,sackOK,TS val 700 ecr 100,nop,wscale 7> + +0 > . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700> + +0 getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0 + +0 setsockopt(3, SOL_SOCKET, SO_SNDBUF, [30000], 4) = 0 + + +0 write(3, ..., 9880) = 9880 + +0 > P. 1:9881(9880) ack 1 <nop,nop,TS val 200 ecr 700> ++.010 < . 1:1(0) ack 9881 win 10000 <nop,nop,TS val 701 ecr 200> + + +0 write(3, ..., 19760) = 19760 + +0 > P. 9881:29641(19760) ack 1 <nop,nop,TS val 201 ecr 701> ++.010 < . 1:1(0) ack 29641 win 10000 <nop,nop,TS val 702 ecr 201> + + +0 write(3, ..., 39520) = 39520 + +0 > P. 29641:69161(39520) ack 1 <nop,nop,TS val 202 ecr 702> ++.010 < . 1:1(0) ack 69161 win 10000 <nop,nop,TS val 703 ecr 202> + +// One more write to increase cwnd + +0 write(3, ..., 79040) = 79040 + +0 > P. 69161:108681(39520) ack 1 <nop,nop,TS val 203 ecr 703> + +0 > P. 108681:148201(39520) ack 1 <nop,nop,TS val 203 ecr 703> ++.010 < . 1:1(0) ack 148201 win 1000 <nop,nop,TS val 704 ecr 203> + + +0 setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING, + [SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID], 4) = 0 + +// We have one write filling one skb +// last byte can not be stored because of our small SO_SNDBUF + +0 write(3, ..., 65209) = 65208 + +0 > P. 148201:213409(65208) ack 1 <nop,nop,TS val 204 ecr 704> ++.010 < . 1:1(0) ack 213409 win 1000 <nop,nop,TS val 705 ecr 204> + +// SCM_TSTAMP_ACK should be received after the last ack at +// t=60ms. + +0 recvmsg(3, {msg_name(...)=..., + msg_iov(1)=[{...,0}], + msg_flags=MSG_ERRQUEUE|MSG_TRUNC, + msg_control=[ + {cmsg_level=SOL_SOCKET, + cmsg_type=SCM_TIMESTAMPING, + cmsg_data={scm_sec=0,scm_nsec=60000000}}, + {cmsg_level=CMSG_LEVEL_IP, + cmsg_type=CMSG_TYPE_RECVERR, + cmsg_data={ee_errno=ENOMSG, + ee_origin=SO_EE_ORIGIN_TIMESTAMPING, + ee_type=0, + ee_code=0, + ee_info=SCM_TSTAMP_ACK, + ee_data=65207}} + ]}, MSG_ERRQUEUE) = 0 diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c index 8d82140f0f76..3b1ee2d3d417 100644 --- a/tools/testing/selftests/net/tfo.c +++ b/tools/testing/selftests/net/tfo.c @@ -82,8 +82,10 @@ static void run_server(void) error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)"); if (read(connfd, buf, 64) < 0) - perror("read()"); - fprintf(outfile, "%d\n", opt); + error(1, errno, "read()"); + + if (fprintf(outfile, "%d\n", opt) < 0) + error(1, errno, "fprintf()"); fclose(outfile); close(connfd); @@ -92,14 +94,17 @@ static void run_server(void) static void run_client(void) { - int fd; + int fd, ret; char *msg = "Hello, world!"; fd = socket(AF_INET6, SOCK_STREAM, 0); if (fd == -1) error(1, errno, "socket()"); - sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + ret = sendto(fd, msg, strlen(msg), MSG_FASTOPEN, + (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + if (ret < 0) + error(1, errno, "sendto()"); close(fd); } diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh index a4550511830a..f116f888b794 100755 --- a/tools/testing/selftests/net/tfo_passive.sh +++ b/tools/testing/selftests/net/tfo_passive.sh @@ -85,12 +85,15 @@ timeout -k 1s 30s ip netns exec nssv ./tfo \ -s \ -p ${SERVER_PORT} \ -o ${out_file}& +server_pid="$!" wait_local_port_listen nssv ${SERVER_PORT} tcp ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT} +client_exit_status="$?" -wait +wait "$server_pid" +server_exit_status="$?" res=$(cat $out_file) rm $out_file @@ -101,6 +104,14 @@ if [ "$res" = "0" ]; then exit 1 fi +if [ "$client_exit_status" -ne 0 ] || [ "$server_exit_status" -ne 0 ]; then + # Note: timeout(1) exits with 124 if it timed out + echo "client exited with ${client_exit_status}" + echo "server exited with ${server_exit_status}" + cleanup_ns + exit 1 +fi + echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index a4d16a460fbe..9e2ccea13d70 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -3260,17 +3260,25 @@ TEST(data_steal) { ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0); /* Spawn a child and get it into the read wait path of the underlying - * TCP socket. + * TCP socket (before kernel .recvmsg is replaced with the TLS one). */ pid = fork(); ASSERT_GE(pid, 0); if (!pid) { - EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), - sizeof(buf) / 2); + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2 + 1, MSG_WAITALL), + sizeof(buf) / 2 + 1); exit(!__test_passed(_metadata)); } - usleep(10000); + /* Send a sync byte and poll until it's consumed to ensure + * the child is in recv() before we proceed to install TLS. + */ + ASSERT_EQ(send(fd, buf, 1, 0), 1); + do { + usleep(500); + } while (recv(cfd, buf, 1, MSG_PEEK | MSG_DONTWAIT) == 1); + EXPECT_EQ(errno, EAGAIN); + ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c index 0efc67b0357a..8a5cd5cb5472 100644 --- a/tools/testing/selftests/net/tun.c +++ b/tools/testing/selftests/net/tun.c @@ -8,14 +8,119 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> -#include <linux/if.h> #include <linux/if_tun.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> #include <sys/ioctl.h> #include <sys/socket.h> #include "kselftest_harness.h" +#include "tuntap_helpers.h" + +static const char param_dev_geneve_name[] = "geneve1"; +static unsigned char param_hwaddr_outer_dst[] = { 0x00, 0xfe, 0x98, + 0x14, 0x22, 0x42 }; +static unsigned char param_hwaddr_outer_src[] = { 0x00, 0xfe, 0x98, + 0x94, 0xd2, 0x43 }; +static unsigned char param_hwaddr_inner_dst[] = { 0x00, 0xfe, 0x98, + 0x94, 0x22, 0xcc }; +static unsigned char param_hwaddr_inner_src[] = { 0x00, 0xfe, 0x98, + 0x94, 0xd2, 0xdd }; + +static struct in_addr param_ipaddr4_outer_dst = { + __constant_htonl(0xac100001), +}; + +static struct in_addr param_ipaddr4_outer_src = { + __constant_htonl(0xac100002), +}; + +static struct in_addr param_ipaddr4_inner_dst = { + __constant_htonl(0xac100101), +}; + +static struct in_addr param_ipaddr4_inner_src = { + __constant_htonl(0xac100102), +}; + +static struct in6_addr param_ipaddr6_outer_dst = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } }, +}; + +static struct in6_addr param_ipaddr6_outer_src = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, +}; + +static struct in6_addr param_ipaddr6_inner_dst = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } }, +}; + +static struct in6_addr param_ipaddr6_inner_src = { + { { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } }, +}; + +#ifndef BIT +#define BIT(nr) (1UL << (nr)) +#endif + +#define VN_ID 1 +#define VN_PORT 4789 +#define UDP_SRC_PORT 22 +#define UDP_DST_PORT 48878 +#define IPPREFIX_LEN 24 +#define IP6PREFIX_LEN 64 +#define TIMEOUT_SEC 10 +#define TIMEOUT_USEC 100000 +#define MAX_RETRIES 20 + +#define UDP_TUNNEL_GENEVE_4IN4 0x01 +#define UDP_TUNNEL_GENEVE_6IN4 0x02 +#define UDP_TUNNEL_GENEVE_4IN6 0x04 +#define UDP_TUNNEL_GENEVE_6IN6 0x08 + +#define UDP_TUNNEL_MAX_SEGMENTS BIT(7) + +#define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4) +#define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6) + +#define UDP_TUNNEL_GENEVE_4IN4_HDRLEN \ + (ETH_HLEN + 2 * sizeof(struct iphdr) + GENEVE_HLEN + \ + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_6IN6_HDRLEN \ + (ETH_HLEN + 2 * sizeof(struct ipv6hdr) + GENEVE_HLEN + \ + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_4IN6_HDRLEN \ + (ETH_HLEN + sizeof(struct iphdr) + sizeof(struct ipv6hdr) + \ + GENEVE_HLEN + 2 * sizeof(struct udphdr)) +#define UDP_TUNNEL_GENEVE_6IN4_HDRLEN \ + (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct iphdr) + \ + GENEVE_HLEN + 2 * sizeof(struct udphdr)) + +#define UDP_TUNNEL_HDRLEN(type) \ + ((type) == UDP_TUNNEL_GENEVE_4IN4 ? UDP_TUNNEL_GENEVE_4IN4_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_6IN6 ? UDP_TUNNEL_GENEVE_6IN6_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_4IN6 ? UDP_TUNNEL_GENEVE_4IN6_HDRLEN : \ + (type) == UDP_TUNNEL_GENEVE_6IN4 ? UDP_TUNNEL_GENEVE_6IN4_HDRLEN : \ + 0) + +#define UDP_TUNNEL_MSS(type) (ETH_DATA_LEN - UDP_TUNNEL_HDRLEN(type)) +#define UDP_TUNNEL_MAX(type, is_tap) \ + (ETH_MAX_MTU - UDP_TUNNEL_HDRLEN(type) - ((is_tap) ? ETH_HLEN : 0)) + +#define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) +#define MAX_VNET_TUNNEL_PACKET_SZ \ + (TUN_VNET_TNL_SIZE + ETH_HLEN + UDP_TUNNEL_GENEVE_6IN6_HDRLEN + \ + ETH_MAX_MTU) + +struct geneve_setup_config { + int family; + union { + struct in_addr r4; + struct in6_addr r6; + } remote; + __be32 vnid; + __be16 vnport; + unsigned char hwaddr[6]; + uint8_t csum; +}; static int tun_attach(int fd, char *dev) { @@ -25,7 +130,7 @@ static int tun_attach(int fd, char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_ATTACH_QUEUE; - return ioctl(fd, TUNSETQUEUE, (void *) &ifr); + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } static int tun_detach(int fd, char *dev) @@ -36,7 +141,7 @@ static int tun_detach(int fd, char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_DETACH_QUEUE; - return ioctl(fd, TUNSETQUEUE, (void *) &ifr); + return ioctl(fd, TUNSETQUEUE, (void *)&ifr); } static int tun_alloc(char *dev) @@ -54,7 +159,7 @@ static int tun_alloc(char *dev) strcpy(ifr.ifr_name, dev); ifr.ifr_flags = IFF_TAP | IFF_NAPI | IFF_MULTI_QUEUE; - err = ioctl(fd, TUNSETIFF, (void *) &ifr); + err = ioctl(fd, TUNSETIFF, (void *)&ifr); if (err < 0) { fprintf(stderr, "can't TUNSETIFF: %s\n", strerror(errno)); close(fd); @@ -66,42 +171,315 @@ static int tun_alloc(char *dev) static int tun_delete(char *dev) { - struct { - struct nlmsghdr nh; - struct ifinfomsg ifm; - unsigned char data[64]; - } req; - struct rtattr *rta; - int ret, rtnl; + return ip_link_del(dev); +} + +static int tun_open(char *dev, const int flags, const int hdrlen, + const int features, const unsigned char *mac_addr) +{ + struct ifreq ifr = { 0 }; + int fd, sk = -1; + + fd = open("/dev/net/tun", O_RDWR); + if (fd < 0) { + perror("open"); + return -1; + } + + ifr.ifr_flags = flags; + if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) { + perror("ioctl(TUNSETIFF)"); + goto err; + } + strcpy(dev, ifr.ifr_name); + + if (hdrlen > 0) { + if (ioctl(fd, TUNSETVNETHDRSZ, &hdrlen) < 0) { + perror("ioctl(TUNSETVNETHDRSZ)"); + goto err; + } + } + + if (features) { + if (ioctl(fd, TUNSETOFFLOAD, features) < 0) { + perror("ioctl(TUNSETOFFLOAD)"); + goto err; + } + } + + sk = socket(PF_INET, SOCK_DGRAM, 0); + if (sk < 0) { + perror("socket"); + goto err; + } + + if (ioctl(sk, SIOCGIFFLAGS, &ifr) < 0) { + perror("ioctl(SIOCGIFFLAGS)"); + goto err; + } + + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); + if (ioctl(sk, SIOCSIFFLAGS, &ifr) < 0) { + perror("ioctl(SIOCSIFFLAGS)"); + goto err; + } + + if (mac_addr && flags & IFF_TAP) { + ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; + memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETH_ALEN); + + if (ioctl(sk, SIOCSIFHWADDR, &ifr) < 0) { + perror("ioctl(SIOCSIFHWADDR)"); + goto err; + } + } + +out: + if (sk >= 0) + close(sk); + return fd; + +err: + close(fd); + fd = -1; + goto out; +} + +static size_t sockaddr_len(int family) +{ + return (family == AF_INET) ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6); +} + +static int geneve_fill_newlink(struct rt_link_newlink_req *req, void *data) +{ + struct geneve_setup_config *cfg = data; + +#define SET_GENEVE_REMOTE rt_link_newlink_req_set_linkinfo_data_geneve_remote +#define SET_GENEVE_REMOTE6 rt_link_newlink_req_set_linkinfo_data_geneve_remote6 + + rt_link_newlink_req_set_address(req, cfg->hwaddr, ETH_ALEN); + rt_link_newlink_req_set_linkinfo_data_geneve_id(req, cfg->vnid); + rt_link_newlink_req_set_linkinfo_data_geneve_port(req, cfg->vnport); + rt_link_newlink_req_set_linkinfo_data_geneve_udp_csum(req, cfg->csum); + + if (cfg->family == AF_INET) + SET_GENEVE_REMOTE(req, cfg->remote.r4.s_addr); + else + SET_GENEVE_REMOTE6(req, &cfg->remote.r6, + sizeof(cfg->remote.r6)); + + return 0; +} + +static int geneve_create(const char *dev, int family, void *remote, + void *hwaddr) +{ + struct geneve_setup_config geneve; + + memset(&geneve, 0, sizeof(geneve)); + geneve.vnid = VN_ID; + geneve.vnport = htons(VN_PORT); + geneve.csum = 1; + geneve.family = family; + if (family == AF_INET) + memcpy(&geneve.remote.r4, remote, sizeof(struct in_addr)); + else + memcpy(&geneve.remote.r6, remote, sizeof(struct in6_addr)); + memcpy(geneve.hwaddr, hwaddr, ETH_ALEN); + + return ip_link_add(dev, "geneve", geneve_fill_newlink, (void *)&geneve); +} + +static int set_pmtu_discover(int fd, bool is_ipv4) +{ + int level, name, val; + + if (is_ipv4) { + level = SOL_IP; + name = IP_MTU_DISCOVER; + val = IP_PMTUDISC_DO; + } else { + level = SOL_IPV6; + name = IPV6_MTU_DISCOVER; + val = IPV6_PMTUDISC_DO; + } + + return setsockopt(fd, level, name, &val, sizeof(val)); +} + +static int udp_socket_open(struct sockaddr_storage *ssa, bool do_frag, + bool do_connect, struct sockaddr_storage *dsa) +{ + struct timeval to = { .tv_sec = TIMEOUT_SEC }; + int fd, family = ssa->ss_family; + int salen = sockaddr_len(family); + + fd = socket(family, SOCK_DGRAM, 0); + if (fd < 0) + return -1; + + if (bind(fd, (struct sockaddr *)ssa, salen) < 0) { + perror("bind"); + goto err; + } + + if (do_connect && connect(fd, (struct sockaddr *)dsa, salen) < 0) { + perror("connect"); + goto err; + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)) < 0) { + perror("setsockopt(SO_RCVTIMEO)"); + goto err; + } + + if (!do_frag && set_pmtu_discover(fd, family == AF_INET) < 0) { + perror("set_pmtu_discover"); + goto err; + } + return fd; + +err: + close(fd); + return -1; +} + +static void parse_route_rsp(struct rt_route_getroute_rsp *rsp, void *rtm_type) +{ + *(uint8_t *)rtm_type = rsp->_hdr.rtm_type; +} + +static int ip_route_check(const char *intf, int family, void *addr) +{ + uint8_t rtm_type, table = RT_TABLE_LOCAL; + int retries = MAX_RETRIES; - rtnl = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); - if (rtnl < 0) { - fprintf(stderr, "can't open rtnl: %s\n", strerror(errno)); - return 1; + while (retries-- > 0) { + if (ip_route_get(intf, family, table, addr, parse_route_rsp, + &rtm_type) == 0 && + rtm_type == RTN_LOCAL) + break; + + usleep(TIMEOUT_USEC); } - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(req.ifm))); - req.nh.nlmsg_flags = NLM_F_REQUEST; - req.nh.nlmsg_type = RTM_DELLINK; + if (retries < 0) + return -1; + + return 0; +} + +static int send_gso_udp_msg(int socket, struct sockaddr_storage *addr, + uint8_t *send_buf, int send_len, int gso_size) +{ + char control[CMSG_SPACE(sizeof(uint16_t))] = { 0 }; + int alen = sockaddr_len(addr->ss_family); + struct msghdr msg = { 0 }; + struct iovec iov = { 0 }; + int ret; + + iov.iov_base = send_buf; + iov.iov_len = send_len; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_name = addr; + msg.msg_namelen = alen; - req.ifm.ifi_family = AF_UNSPEC; + if (gso_size > 0) { + struct cmsghdr *cmsg; - rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len)); - rta->rta_type = IFLA_IFNAME; - rta->rta_len = RTA_LENGTH(IFNAMSIZ); - req.nh.nlmsg_len += rta->rta_len; - memcpy(RTA_DATA(rta), dev, IFNAMSIZ); + msg.msg_control = control; + msg.msg_controllen = sizeof(control); - ret = send(rtnl, &req, req.nh.nlmsg_len, 0); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_UDP; + cmsg->cmsg_type = UDP_SEGMENT; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint16_t)); + *(uint16_t *)CMSG_DATA(cmsg) = gso_size; + } + + ret = sendmsg(socket, &msg, 0); if (ret < 0) - fprintf(stderr, "can't send: %s\n", strerror(errno)); - ret = (unsigned int)ret != req.nh.nlmsg_len; + perror("sendmsg"); - close(rtnl); return ret; } +static int validate_hdrlen(uint8_t **cur, int *len, int x) +{ + if (*len < x) + return -1; + *cur += x; + *len -= x; + return 0; +} + +static int parse_udp_tunnel_vnet_packet(uint8_t *buf, int len, int tunnel_type, + bool is_tap) +{ + struct ipv6hdr *iph6; + struct udphdr *udph; + struct iphdr *iph4; + uint8_t *cur = buf; + + if (validate_hdrlen(&cur, &len, TUN_VNET_TNL_SIZE)) + return -1; + + if (is_tap) { + if (validate_hdrlen(&cur, &len, ETH_HLEN)) + return -1; + } + + if (tunnel_type & UDP_TUNNEL_OUTER_IPV4) { + iph4 = (struct iphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct iphdr))) + return -1; + if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP) + return -1; + } else { + iph6 = (struct ipv6hdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr))) + return -1; + if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP) + return -1; + } + + udph = (struct udphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct udphdr))) + return -1; + if (ntohs(udph->dest) != VN_PORT) + return -1; + + if (validate_hdrlen(&cur, &len, GENEVE_HLEN)) + return -1; + if (validate_hdrlen(&cur, &len, ETH_HLEN)) + return -1; + + if (tunnel_type & UDP_TUNNEL_INNER_IPV4) { + iph4 = (struct iphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct iphdr))) + return -1; + if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP) + return -1; + } else { + iph6 = (struct ipv6hdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr))) + return -1; + if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP) + return -1; + } + + udph = (struct udphdr *)cur; + if (validate_hdrlen(&cur, &len, sizeof(struct udphdr))) + return -1; + if (ntohs(udph->dest) != UDP_DST_PORT) + return -1; + + return len; +} + FIXTURE(tun) { char ifname[IFNAMSIZ]; @@ -127,31 +505,36 @@ FIXTURE_TEARDOWN(tun) close(self->fd2); } -TEST_F(tun, delete_detach_close) { +TEST_F(tun, delete_detach_close) +{ EXPECT_EQ(tun_delete(self->ifname), 0); EXPECT_EQ(tun_detach(self->fd, self->ifname), -1); EXPECT_EQ(errno, 22); } -TEST_F(tun, detach_delete_close) { +TEST_F(tun, detach_delete_close) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, detach_close_delete) { +TEST_F(tun, detach_close_delete) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); close(self->fd); self->fd = -1; EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, reattach_delete_close) { +TEST_F(tun, reattach_delete_close) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); EXPECT_EQ(tun_delete(self->ifname), 0); } -TEST_F(tun, reattach_close_delete) { +TEST_F(tun, reattach_close_delete) +{ EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); close(self->fd); @@ -159,4 +542,447 @@ TEST_F(tun, reattach_close_delete) { EXPECT_EQ(tun_delete(self->ifname), 0); } +FIXTURE(tun_vnet_udptnl) +{ + char ifname[IFNAMSIZ]; + int fd, sock; +}; + +FIXTURE_VARIANT(tun_vnet_udptnl) +{ + int tunnel_type; + int gso_size; + int data_size; + int r_num_mss; + bool is_tap, no_gso; +}; + +/* clang-format off */ +#define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc) \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1byte) { \ + /* no GSO: send a single byte */ \ + .tunnel_type = type, \ + .data_size = 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1mss) { \ + /* no GSO: send a single MSS, fall back to no GSO */ \ + .tunnel_type = type, \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_gtmss) { \ + /* no GSO: send a single MSS + 1B: fail */ \ + .tunnel_type = type, \ + .data_size = UDP_TUNNEL_MSS(type) + 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1byte) { \ + /* GSO: send 1 byte, gso 1 byte, fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = 1, \ + .data_size = 1, \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) { \ + /* send a single MSS: fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_ltgso) { \ + /* data <= MSS < gso: will fall back to no GSO */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type) + 1, \ + .data_size = UDP_TUNNEL_MSS(type), \ + .r_num_mss = 1, \ + .is_tap = true, \ + .no_gso = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_gtgso) { \ + /* GSO: a single MSS + 1B */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type) + 1, \ + .r_num_mss = 2, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_2mss) { \ + /* no GSO: send exactly 2 MSS */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MSS(type) * 2, \ + .r_num_mss = 2, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxbytes) { \ + /* GSO: send max bytes */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = UDP_TUNNEL_MAX(type, true), \ + .r_num_mss = UDP_TUNNEL_MAX(type, true) / \ + UDP_TUNNEL_MSS(type) + 1, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_over_maxbytes) { \ + /* GSO: send oversize max bytes: fail */ \ + .tunnel_type = type, \ + .gso_size = UDP_TUNNEL_MSS(type), \ + .data_size = ETH_MAX_MTU, \ + .r_num_mss = ETH_MAX_MTU / UDP_TUNNEL_MSS(type) + 1, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxsegs) { \ + /* GSO: send max number of min sized segments */ \ + .tunnel_type = type, \ + .gso_size = 1, \ + .data_size = UDP_TUNNEL_MAX_SEGMENTS, \ + .r_num_mss = UDP_TUNNEL_MAX_SEGMENTS, \ + .is_tap = true, \ + }; \ + FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_5byte) { \ + /* GSO: send 5 bytes, gso 2 bytes */ \ + .tunnel_type = type, \ + .gso_size = 2, \ + .data_size = 5, \ + .r_num_mss = 3, \ + .is_tap = true, \ + } /* clang-format on */ + +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN6, 4in6); +TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN6, 6in6); + +static void assign_ifaddr_vars(int family, int is_outer, void **srcip, + void **dstip, void **srcmac, void **dstmac) +{ + if (is_outer) { + if (family == AF_INET) { + *srcip = (void *)¶m_ipaddr4_outer_src; + *dstip = (void *)¶m_ipaddr4_outer_dst; + } else { + *srcip = (void *)¶m_ipaddr6_outer_src; + *dstip = (void *)¶m_ipaddr6_outer_dst; + } + *srcmac = param_hwaddr_outer_src; + *dstmac = param_hwaddr_outer_dst; + } else { + if (family == AF_INET) { + *srcip = (void *)¶m_ipaddr4_inner_src; + *dstip = (void *)¶m_ipaddr4_inner_dst; + } else { + *srcip = (void *)¶m_ipaddr6_inner_src; + *dstip = (void *)¶m_ipaddr6_inner_dst; + } + *srcmac = param_hwaddr_inner_src; + *dstmac = param_hwaddr_inner_dst; + } +} + +static void assign_sockaddr_vars(int family, int is_outer, + struct sockaddr_storage *src, + struct sockaddr_storage *dst) +{ + src->ss_family = family; + dst->ss_family = family; + + if (family == AF_INET) { + struct sockaddr_in *s4 = (struct sockaddr_in *)src; + struct sockaddr_in *d4 = (struct sockaddr_in *)dst; + + s4->sin_addr = is_outer ? param_ipaddr4_outer_src : + param_ipaddr4_inner_src; + d4->sin_addr = is_outer ? param_ipaddr4_outer_dst : + param_ipaddr4_inner_dst; + if (!is_outer) { + s4->sin_port = htons(UDP_SRC_PORT); + d4->sin_port = htons(UDP_DST_PORT); + } + } else { + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *d6 = (struct sockaddr_in6 *)dst; + + s6->sin6_addr = is_outer ? param_ipaddr6_outer_src : + param_ipaddr6_inner_src; + d6->sin6_addr = is_outer ? param_ipaddr6_outer_dst : + param_ipaddr6_inner_dst; + if (!is_outer) { + s6->sin6_port = htons(UDP_SRC_PORT); + d6->sin6_port = htons(UDP_DST_PORT); + } + } +} + +FIXTURE_SETUP(tun_vnet_udptnl) +{ + int ret, family, prefix, flags, features; + int tunnel_type = variant->tunnel_type; + struct sockaddr_storage ssa, dsa; + void *sip, *dip, *smac, *dmac; + + flags = (variant->is_tap ? IFF_TAP : IFF_TUN) | IFF_VNET_HDR | + IFF_MULTI_QUEUE | IFF_NO_PI; + features = TUN_F_CSUM | TUN_F_UDP_TUNNEL_GSO | + TUN_F_UDP_TUNNEL_GSO_CSUM | TUN_F_USO4 | TUN_F_USO6; + self->fd = tun_open(self->ifname, flags, TUN_VNET_TNL_SIZE, features, + param_hwaddr_outer_src); + ASSERT_GE(self->fd, 0); + + family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : AF_INET6; + prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN; + assign_ifaddr_vars(family, 1, &sip, &dip, &smac, &dmac); + + ret = ip_addr_add(self->ifname, family, sip, prefix); + ASSERT_EQ(ret, 0); + ret = ip_neigh_add(self->ifname, family, dip, dmac); + ASSERT_EQ(ret, 0); + ret = ip_route_check(self->ifname, family, sip); + ASSERT_EQ(ret, 0); + + ret = geneve_create(param_dev_geneve_name, family, dip, + param_hwaddr_inner_src); + ASSERT_EQ(ret, 0); + + family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : AF_INET6; + prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN; + assign_ifaddr_vars(family, 0, &sip, &dip, &smac, &dmac); + + ret = ip_addr_add(param_dev_geneve_name, family, sip, prefix); + ASSERT_EQ(ret, 0); + ret = ip_neigh_add(param_dev_geneve_name, family, dip, dmac); + ASSERT_EQ(ret, 0); + ret = ip_route_check(param_dev_geneve_name, family, sip); + ASSERT_EQ(ret, 0); + + assign_sockaddr_vars(family, 0, &ssa, &dsa); + self->sock = udp_socket_open(&ssa, false, true, &dsa); + ASSERT_GE(self->sock, 0); +} + +FIXTURE_TEARDOWN(tun_vnet_udptnl) +{ + int ret; + + if (self->sock != -1) + close(self->sock); + + ret = ip_link_del(param_dev_geneve_name); + EXPECT_EQ(ret, 0); + + ret = tun_delete(self->ifname); + EXPECT_EQ(ret, 0); +} + +static int build_gso_packet_into_tun(const FIXTURE_VARIANT(tun_vnet_udptnl) * + variant, + uint8_t *buf) +{ + int pktlen, hlen, proto, inner_family, outer_family; + int tunnel_type = variant->tunnel_type; + int payload_len = variant->data_size; + int gso_size = variant->gso_size; + uint8_t *outer_udph, *cur = buf; + void *sip, *dip, *smac, *dmac; + bool is_tap = variant->is_tap; + + hlen = (is_tap ? ETH_HLEN : 0) + UDP_TUNNEL_HDRLEN(tunnel_type); + inner_family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : + AF_INET6; + outer_family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : + AF_INET6; + + cur += build_virtio_net_hdr_v1_hash_tunnel(cur, is_tap, hlen, gso_size, + outer_family, inner_family); + + pktlen = hlen + payload_len; + assign_ifaddr_vars(outer_family, 1, &sip, &dip, &smac, &dmac); + + if (is_tap) { + proto = outer_family == AF_INET ? ETH_P_IP : ETH_P_IPV6; + pktlen -= ETH_HLEN; + cur += build_eth(cur, proto, dmac, smac); + } + + if (outer_family == AF_INET) { + pktlen = pktlen - sizeof(struct iphdr); + cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip); + } else { + pktlen = pktlen - sizeof(struct ipv6hdr); + cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip); + } + + outer_udph = cur; + assign_ifaddr_vars(inner_family, 0, &sip, &dip, &smac, &dmac); + + pktlen -= sizeof(struct udphdr); + proto = inner_family == AF_INET ? ETH_P_IP : ETH_P_IPV6; + cur += build_udp_header(cur, UDP_SRC_PORT, VN_PORT, pktlen); + cur += build_geneve_header(cur, VN_ID); + cur += build_eth(cur, proto, dmac, smac); + + pktlen = sizeof(struct udphdr) + payload_len; + if (inner_family == AF_INET) + cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip); + else + cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip); + + cur += build_udp_packet(cur, UDP_DST_PORT, UDP_SRC_PORT, payload_len, + inner_family, false); + + build_udp_packet_csum(outer_udph, outer_family, false); + + return cur - buf; +} + +static int +receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * variant, + int *r_num_mss) +{ + uint8_t packet_buf[MAX_VNET_TUNNEL_PACKET_SZ]; + int len, total_len = 0, socket = self->sock; + int payload_len = variant->data_size; + + while (total_len < payload_len) { + len = recv(socket, packet_buf, sizeof(packet_buf), 0); + if (len <= 0) { + if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) + perror("recv"); + break; + } + + (*r_num_mss)++; + total_len += len; + } + + return total_len; +} + +static int send_gso_packet_into_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * + variant) +{ + int family = (variant->tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : + AF_INET6; + uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ] = { 0 }; + int payload_len = variant->data_size; + int gso_size = variant->gso_size; + struct sockaddr_storage ssa, dsa; + + assign_sockaddr_vars(family, 0, &ssa, &dsa); + return send_gso_udp_msg(self->sock, &dsa, buf, payload_len, gso_size); +} + +static int +receive_gso_packet_from_tun(FIXTURE_DATA(tun_vnet_udptnl) * self, + const FIXTURE_VARIANT(tun_vnet_udptnl) * variant, + struct virtio_net_hdr_v1_hash_tunnel *vnet_hdr) +{ + struct timeval timeout = { .tv_sec = TIMEOUT_SEC }; + uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ]; + int tunnel_type = variant->tunnel_type; + int payload_len = variant->data_size; + bool is_tap = variant->is_tap; + int ret, len, total_len = 0; + int tun_fd = self->fd; + fd_set fdset; + + while (total_len < payload_len) { + FD_ZERO(&fdset); + FD_SET(tun_fd, &fdset); + + ret = select(tun_fd + 1, &fdset, NULL, NULL, &timeout); + if (ret <= 0) { + perror("select"); + break; + } + if (!FD_ISSET(tun_fd, &fdset)) + continue; + + len = read(tun_fd, buf, sizeof(buf)); + if (len <= 0) { + if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) + perror("read"); + break; + } + + len = parse_udp_tunnel_vnet_packet(buf, len, tunnel_type, + is_tap); + if (len < 0) + continue; + + if (total_len == 0) + memcpy(vnet_hdr, buf, TUN_VNET_TNL_SIZE); + + total_len += len; + } + + return total_len; +} + +TEST_F(tun_vnet_udptnl, send_gso_packet) +{ + uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ]; + int r_num_mss = 0; + int ret, off; + + memset(pkt, 0, sizeof(pkt)); + off = build_gso_packet_into_tun(variant, pkt); + ret = write(self->fd, pkt, off); + ASSERT_EQ(ret, off); + + ret = receive_gso_packet_from_tunnel(self, variant, &r_num_mss); + ASSERT_EQ(ret, variant->data_size); + ASSERT_EQ(r_num_mss, variant->r_num_mss); +} + +TEST_F(tun_vnet_udptnl, recv_gso_packet) +{ + struct virtio_net_hdr_v1_hash_tunnel vnet_hdr = { 0 }; + struct virtio_net_hdr_v1 *vh = &vnet_hdr.hash_hdr.hdr; + int ret, gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; + + ret = send_gso_packet_into_tunnel(self, variant); + ASSERT_EQ(ret, variant->data_size); + + memset(&vnet_hdr, 0, sizeof(vnet_hdr)); + ret = receive_gso_packet_from_tun(self, variant, &vnet_hdr); + ASSERT_EQ(ret, variant->data_size); + + if (!variant->no_gso) { + ASSERT_EQ(vh->gso_size, variant->gso_size); + gso_type |= (variant->tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? + (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4) : + (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6); + ASSERT_EQ(vh->gso_type, gso_type); + } +} + +XFAIL_ADD(tun_vnet_udptnl, 4in4_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_nogsosz_gtmss, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_nogsosz_gtmss, recv_gso_packet); + +XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, send_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, send_gso_packet); + +XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet); +XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet); + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/tuntap_helpers.h b/tools/testing/selftests/net/tuntap_helpers.h new file mode 100644 index 000000000000..d6c0437136ec --- /dev/null +++ b/tools/testing/selftests/net/tuntap_helpers.h @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _TUNTAP_HELPERS_H +#define _TUNTAP_HELPERS_H + +#include <errno.h> +#include <linux/if_packet.h> +#include <linux/ipv6.h> +#include <linux/virtio_net.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> +#include <netinet/udp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <ynl.h> + +#include "rt-route-user.h" +#include "rt-addr-user.h" +#include "rt-neigh-user.h" +#include "rt-link-user.h" + +#define GENEVE_HLEN 8 +#define PKT_DATA 0xCB +#define TUNTAP_DEFAULT_TTL 8 +#define TUNTAP_DEFAULT_IPID 1337 + +unsigned int if_nametoindex(const char *ifname); + +static inline int ip_addr_len(int family) +{ + return (family == AF_INET) ? sizeof(struct in_addr) : + sizeof(struct in6_addr); +} + +static inline void fill_ifaddr_msg(struct ifaddrmsg *ifam, int family, + int prefix, int flags, const char *dev) +{ + ifam->ifa_family = family; + ifam->ifa_prefixlen = prefix; + ifam->ifa_index = if_nametoindex(dev); + ifam->ifa_flags = flags; + ifam->ifa_scope = RT_SCOPE_UNIVERSE; +} + +static inline int ip_addr_add(const char *dev, int family, void *addr, + uint8_t prefix) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + int ifa_flags = IFA_F_PERMANENT | IFA_F_NODAD; + int ret = -1, ipalen = ip_addr_len(family); + struct rt_addr_newaddr_req *req; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_addr_family, NULL); + if (!ys) + return -1; + + req = rt_addr_newaddr_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_ifaddr_msg(&req->_hdr, family, prefix, ifa_flags, dev); + rt_addr_newaddr_req_set_nlflags(req, nl_flags); + rt_addr_newaddr_req_set_local(req, addr, ipalen); + + ret = rt_addr_newaddr(ys, req); + rt_addr_newaddr_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline void fill_neigh_req_header(struct ndmsg *ndm, int family, + int state, const char *dev) +{ + ndm->ndm_family = family; + ndm->ndm_ifindex = if_nametoindex(dev); + ndm->ndm_state = state; + ndm->ndm_flags = 0; + ndm->ndm_type = RTN_UNICAST; +} + +static inline int ip_neigh_add(const char *dev, int family, void *addr, + unsigned char *lladdr) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + int ret = -1, ipalen = ip_addr_len(family); + struct rt_neigh_newneigh_req *req; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_neigh_family, NULL); + if (!ys) + return -1; + + req = rt_neigh_newneigh_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_neigh_req_header(&req->_hdr, family, NUD_PERMANENT, dev); + rt_neigh_newneigh_req_set_nlflags(req, nl_flags); + rt_neigh_newneigh_req_set_dst(req, addr, ipalen); + rt_neigh_newneigh_req_set_lladdr(req, lladdr, ETH_ALEN); + rt_neigh_newneigh_req_set_ifindex(req, if_nametoindex(dev)); + + ret = rt_neigh_newneigh(ys, req); + rt_neigh_newneigh_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline void fill_route_req_header(struct rtmsg *rtm, int family, + int table) +{ + rtm->rtm_family = family; + rtm->rtm_table = table; +} + +static inline int +ip_route_get(const char *dev, int family, int table, void *dst, + void (*parse_rsp)(struct rt_route_getroute_rsp *rsp, void *out), + void *out) +{ + int ret = -1, ipalen = ip_addr_len(family); + struct rt_route_getroute_req *req; + struct rt_route_getroute_rsp *rsp; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_route_family, NULL); + if (!ys) + return -1; + + req = rt_route_getroute_req_alloc(); + if (!req) + goto err_req_alloc; + + fill_route_req_header(&req->_hdr, family, table); + rt_route_getroute_req_set_nlflags(req, NLM_F_REQUEST); + rt_route_getroute_req_set_dst(req, dst, ipalen); + rt_route_getroute_req_set_oif(req, if_nametoindex(dev)); + + rsp = rt_route_getroute(ys, req); + if (!rsp) + goto err_rsp_get; + + ret = 0; + if (parse_rsp) + parse_rsp(rsp, out); + + rt_route_getroute_rsp_free(rsp); +err_rsp_get: + rt_route_getroute_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline int +ip_link_add(const char *dev, char *link_type, + int (*fill_link_attr)(struct rt_link_newlink_req *req, void *data), + void *data) +{ + int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + struct rt_link_newlink_req *req; + struct ynl_sock *ys; + int ret = -1; + + ys = ynl_sock_create(&ynl_rt_link_family, NULL); + if (!ys) + return -1; + + req = rt_link_newlink_req_alloc(); + if (!req) + goto err_req_alloc; + + req->_hdr.ifi_flags = IFF_UP; + rt_link_newlink_req_set_nlflags(req, nl_flags); + rt_link_newlink_req_set_ifname(req, dev); + rt_link_newlink_req_set_linkinfo_kind(req, link_type); + + if (fill_link_attr && fill_link_attr(req, data) < 0) + goto err_attr_fill; + + ret = rt_link_newlink(ys, req); +err_attr_fill: + rt_link_newlink_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline int ip_link_del(const char *dev) +{ + struct rt_link_dellink_req *req; + struct ynl_sock *ys; + int ret = -1; + + ys = ynl_sock_create(&ynl_rt_link_family, NULL); + if (!ys) + return -1; + + req = rt_link_dellink_req_alloc(); + if (!req) + goto err_req_alloc; + + rt_link_dellink_req_set_nlflags(req, NLM_F_REQUEST); + rt_link_dellink_req_set_ifname(req, dev); + + ret = rt_link_dellink(ys, req); + rt_link_dellink_req_free(req); +err_req_alloc: + ynl_sock_destroy(ys); + return ret; +} + +static inline size_t build_eth(uint8_t *buf, uint16_t proto, unsigned char *src, + unsigned char *dest) +{ + struct ethhdr *eth = (struct ethhdr *)buf; + + eth->h_proto = htons(proto); + memcpy(eth->h_source, src, ETH_ALEN); + memcpy(eth->h_dest, dest, ETH_ALEN); + + return ETH_HLEN; +} + +static inline uint32_t add_csum(const uint8_t *buf, int len) +{ + uint16_t *sbuf = (uint16_t *)buf; + uint32_t sum = 0; + + while (len > 1) { + sum += *sbuf++; + len -= 2; + } + + if (len) + sum += *(uint8_t *)sbuf; + + return sum; +} + +static inline uint16_t finish_ip_csum(uint32_t sum) +{ + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + return ~((uint16_t)sum); +} + +static inline uint16_t build_ip_csum(const uint8_t *buf, int len, uint32_t sum) +{ + sum += add_csum(buf, len); + return finish_ip_csum(sum); +} + +static inline int build_ipv4_header(uint8_t *buf, uint8_t proto, + int payload_len, struct in_addr *src, + struct in_addr *dst) +{ + struct iphdr *iph = (struct iphdr *)buf; + + iph->ihl = 5; + iph->version = 4; + iph->ttl = TUNTAP_DEFAULT_TTL; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->id = htons(TUNTAP_DEFAULT_IPID); + iph->protocol = proto; + iph->saddr = src->s_addr; + iph->daddr = dst->s_addr; + iph->check = build_ip_csum(buf, iph->ihl << 2, 0); + + return iph->ihl << 2; +} + +static inline void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield) +{ + uint16_t val, *ptr = (uint16_t *)ip6h; + + val = ntohs(*ptr); + val &= 0xF00F; + val |= ((uint16_t)dsfield) << 4; + *ptr = htons(val); +} + +static inline int build_ipv6_header(uint8_t *buf, uint8_t proto, + uint8_t dsfield, int payload_len, + struct in6_addr *src, struct in6_addr *dst) +{ + struct ipv6hdr *ip6h = (struct ipv6hdr *)buf; + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = proto; + ip6h->hop_limit = TUNTAP_DEFAULT_TTL; + ipv6_set_dsfield(ip6h, dsfield); + memcpy(&ip6h->saddr, src, sizeof(ip6h->saddr)); + memcpy(&ip6h->daddr, dst, sizeof(ip6h->daddr)); + + return sizeof(struct ipv6hdr); +} + +static inline int build_geneve_header(uint8_t *buf, uint32_t vni) +{ + uint16_t protocol = htons(ETH_P_TEB); + uint32_t geneve_vni = htonl((vni << 8) & 0xffffff00); + + memcpy(buf + 2, &protocol, 2); + memcpy(buf + 4, &geneve_vni, 4); + return GENEVE_HLEN; +} + +static inline int build_udp_header(uint8_t *buf, uint16_t sport, uint16_t dport, + int payload_len) +{ + struct udphdr *udph = (struct udphdr *)buf; + + udph->source = htons(sport); + udph->dest = htons(dport); + udph->len = htons(sizeof(*udph) + payload_len); + return sizeof(*udph); +} + +static inline void build_udp_packet_csum(uint8_t *buf, int family, + bool csum_off) +{ + struct udphdr *udph = (struct udphdr *)buf; + size_t ipalen = ip_addr_len(family); + uint32_t sum; + + /* No extension IPv4 and IPv6 headers addresses are the last fields */ + sum = add_csum(buf - 2 * ipalen, 2 * ipalen); + sum += htons(IPPROTO_UDP) + udph->len; + + if (!csum_off) + sum += add_csum(buf, udph->len); + + udph->check = finish_ip_csum(sum); +} + +static inline int build_udp_packet(uint8_t *buf, uint16_t sport, uint16_t dport, + int payload_len, int family, bool csum_off) +{ + struct udphdr *udph = (struct udphdr *)buf; + + build_udp_header(buf, sport, dport, payload_len); + memset(buf + sizeof(*udph), PKT_DATA, payload_len); + build_udp_packet_csum(buf, family, csum_off); + + return sizeof(*udph) + payload_len; +} + +static inline int build_virtio_net_hdr_v1_hash_tunnel(uint8_t *buf, bool is_tap, + int hdr_len, int gso_size, + int outer_family, + int inner_family) +{ + struct virtio_net_hdr_v1_hash_tunnel *vh_tunnel = (void *)buf; + struct virtio_net_hdr_v1 *vh = &vh_tunnel->hash_hdr.hdr; + int outer_iphlen, inner_iphlen, eth_hlen, gso_type; + + eth_hlen = is_tap ? ETH_HLEN : 0; + outer_iphlen = (outer_family == AF_INET) ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr); + inner_iphlen = (inner_family == AF_INET) ? sizeof(struct iphdr) : + sizeof(struct ipv6hdr); + + vh_tunnel->outer_th_offset = eth_hlen + outer_iphlen; + vh_tunnel->inner_nh_offset = vh_tunnel->outer_th_offset + ETH_HLEN + + GENEVE_HLEN + sizeof(struct udphdr); + + vh->csum_start = vh_tunnel->inner_nh_offset + inner_iphlen; + vh->csum_offset = __builtin_offsetof(struct udphdr, check); + vh->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vh->hdr_len = hdr_len; + vh->gso_size = gso_size; + + if (gso_size) { + gso_type = outer_family == AF_INET ? + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 : + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; + vh->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4 | gso_type; + } + + return sizeof(struct virtio_net_hdr_v1_hash_tunnel); +} + +#endif /* _TUNTAP_HELPERS_H */ diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c index bcc14688661d..170be192f5c7 100644 --- a/tools/testing/selftests/net/txtimestamp.c +++ b/tools/testing/selftests/net/txtimestamp.c @@ -206,12 +206,10 @@ static void __print_timestamp(const char *name, struct timespec *cur, fprintf(stderr, "\n"); } -static void print_timestamp_usr(void) +static void record_timestamp_usr(void) { if (clock_gettime(CLOCK_REALTIME, &ts_usr)) error(1, errno, "clock_gettime"); - - __print_timestamp(" USR", &ts_usr, 0, 0); } static void print_timestamp(struct scm_timestamping *tss, int tstype, @@ -599,8 +597,6 @@ static void do_test(int family, unsigned int report_opt) fill_header_udp(buf + off, family == PF_INET); } - print_timestamp_usr(); - iov.iov_base = buf; iov.iov_len = total_len; @@ -655,10 +651,14 @@ static void do_test(int family, unsigned int report_opt) } + record_timestamp_usr(); + val = sendmsg(fd, &msg, 0); if (val != total_len) error(1, errno, "send"); + __print_timestamp(" USR", &ts_usr, 0, 0); + /* wait for all errors to be queued, else ACKs arrive OOO */ if (cfg_sleep_usec) usleep(cfg_sleep_usec); diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 40f5c2908dda..0370489d938b 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := nolibc-test +TEST_GEN_PROGS := nolibc-test libc-test include ../lib.mk include $(top_srcdir)/scripts/Makefile.compiler @@ -9,16 +9,16 @@ cc-option = $(call __cc-option, $(CC),,$(1),$(2)) include Makefile.include -CFLAGS = -nostdlib -nostdinc -static \ +$(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \ -isystem $(top_srcdir)/tools/include/nolibc -isystem $(top_srcdir)/usr/include \ $(CFLAGS_NOLIBC_TEST) - -ifeq ($(LLVM),) -LDLIBS := -lgcc -endif - +$(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc) $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers +$(OUTPUT)/libc-test: nolibc-test.c nolibc-test-linkage.c + $(call msg,CC,,$@) + $(Q)$(LINK.c) $^ -o $@ + help: @echo "For the custom nolibc testsuite use '$(MAKE) -f Makefile.nolibc'; available targets:" @$(MAKE) -f Makefile.nolibc help diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index f9d43cbdc894..f5704193038f 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -226,7 +226,7 @@ CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6 CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6 CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2 CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld) -CFLAGS_sparc32 = $(call cc-option,-m32) +CFLAGS_sparc32 = $(call cc-option,-m32) -mcpu=v8 CFLAGS_sh4 = -ml -m4 ifeq ($(origin XARCH),command line) CFLAGS_XARCH = $(CFLAGS_$(XARCH)) @@ -302,15 +302,9 @@ sysroot/$(ARCH)/include: $(Q)$(MAKE) -C $(srctree)/tools/include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone headers_check $(Q)mv sysroot/sysroot sysroot/$(ARCH) -ifneq ($(NOLIBC_SYSROOT),0) nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC) -else -nolibc-test: nolibc-test.c nolibc-test-linkage.c - $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC) -endif libc-test: nolibc-test.c nolibc-test-linkage.c $(QUIET_CC)$(HOSTCC) -o $@ nolibc-test.c nolibc-test-linkage.c diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 3c5a226dad3a..1b9d3b2e2491 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -17,6 +17,7 @@ #include <sys/mman.h> #include <sys/mount.h> #include <sys/prctl.h> +#include <sys/ptrace.h> #include <sys/random.h> #include <sys/reboot.h> #include <sys/resource.h> @@ -877,6 +878,58 @@ int test_file_stream(void) return 0; } +int test_file_stream_wsr(void) +{ + const char dataout[] = "foo"; + const size_t datasz = sizeof(dataout); + char datain[datasz]; + int fd, r; + FILE *f; + + fd = open("/tmp", O_TMPFILE | O_RDWR, 0644); + if (fd == -1) + return -1; + + f = fdopen(fd, "w+"); + if (!f) + return -1; + + errno = 0; + r = fwrite(dataout, 1, datasz, f); + if (r != datasz) + return -1; + + /* Attempt to read from the file without rewinding, + * we should read 0 items. + */ + r = fread(datain, 1, datasz, f); + if (r) + return -1; + + /* Rewind the file to the start */ + r = fseek(f, 0, SEEK_SET); + if (r) + return -1; + + /* Attempt to read back more than was written to + * make sure we handle short reads properly. + * fread() should return the number of complete items. + */ + r = fread(datain, 1, datasz + 1, f); + if (r != datasz) + return -1; + + /* Data we read should match the data we just wrote */ + if (memcmp(datain, dataout, datasz) != 0) + return -1; + + r = fclose(f); + if (r) + return -1; + + return 0; +} + enum fork_type { FORK_STANDARD, FORK_VFORK, @@ -1351,6 +1404,7 @@ int run_syscall(int min, int max) CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break; CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break; CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break; + CASE_TEST(file_stream_wsr); EXPECT_SYSZR(1, test_file_stream_wsr()); break; CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; @@ -1403,9 +1457,10 @@ int run_syscall(int min, int max) CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break; CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break; CASE_TEST(readv_badf); EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break; - CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(1, NULL, 0)); break; + CASE_TEST(readv_zero); EXPECT_SYSZR(1, readv(0, NULL, 0)); break; CASE_TEST(writev_badf); EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break; CASE_TEST(writev_zero); EXPECT_SYSZR(1, writev(1, NULL, 0)); break; + CASE_TEST(ptrace); EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break; CASE_TEST(syscall_noargs); EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break; CASE_TEST(syscall_args); EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break; CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break; @@ -1428,6 +1483,34 @@ int test_difftime(void) return 0; } +int test_time_types(void) +{ +#ifdef NOLIBC + struct __kernel_timespec kts; + struct timespec ts; + + if (!__builtin_types_compatible_p(time_t, __kernel_time64_t)) + return 1; + + if (sizeof(ts) != sizeof(kts)) + return 1; + + if (!__builtin_types_compatible_p(__typeof__(ts.tv_sec), __typeof__(kts.tv_sec))) + return 1; + + if (!__builtin_types_compatible_p(__typeof__(ts.tv_nsec), __typeof__(kts.tv_nsec))) + return 1; + + if (offsetof(__typeof__(ts), tv_sec) != offsetof(__typeof__(kts), tv_sec)) + return 1; + + if (offsetof(__typeof__(ts), tv_nsec) != offsetof(__typeof__(kts), tv_nsec)) + return 1; +#endif /* NOLIBC */ + + return 0; +} + int run_stdlib(int min, int max) { int test; @@ -1553,6 +1636,7 @@ int run_stdlib(int min, int max) CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break; CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break; CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break; + CASE_TEST(time_types); EXPECT_ZR(is_nolibc, test_time_types()); break; case __LINE__: return ret; /* must be last */ diff --git a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c index 23aac6f97061..eecb776c33af 100644 --- a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c +++ b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c @@ -70,6 +70,23 @@ TEST_F(pci_ep_bar, BAR_TEST) EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno); } +TEST_F(pci_ep_bar, BAR_SUBRANGE_TEST) +{ + int ret; + + pci_ep_ioctl(PCITEST_SET_IRQTYPE, PCITEST_IRQ_TYPE_AUTO); + ASSERT_EQ(0, ret) TH_LOG("Can't set AUTO IRQ type"); + + pci_ep_ioctl(PCITEST_BAR_SUBRANGE, variant->barno); + if (ret == -ENODATA) + SKIP(return, "BAR is disabled"); + if (ret == -EBUSY) + SKIP(return, "BAR is test register space"); + if (ret == -EOPNOTSUPP) + SKIP(return, "Subrange map is not supported"); + EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno); +} + FIXTURE(pci_ep_basic) { int fd; diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index 6571e04acd88..8bed951e06a0 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -229,7 +229,7 @@ static void *pidfd_info_pause_thread(void *arg) close(ipc_socket); - /* Sleep untill we're killed. */ + /* Sleep until we're killed. */ pause(); return NULL; } diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh index ac6e5a6e1d3a..9f61c1579edf 100755 --- a/tools/testing/selftests/ptp/phc.sh +++ b/tools/testing/selftests/ptp/phc.sh @@ -8,17 +8,20 @@ ALL_TESTS=" " DEV=$1 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + ############################################################################## # Sanity checks if [[ "$(id -u)" -ne 0 ]]; then echo "SKIP: need root privileges" - exit 0 + exit $ksft_skip fi if [[ "$DEV" == "" ]]; then echo "SKIP: PTP device not provided" - exit 0 + exit $ksft_skip fi require_command() @@ -27,7 +30,7 @@ require_command() if [[ ! -x "$(command -v "$cmd")" ]]; then echo "SKIP: $cmd not installed" - exit 1 + exit $ksft_skip fi } @@ -37,7 +40,7 @@ phc_sanity() if [ $? != 0 ]; then echo "SKIP: unknown clock $DEV: No such device" - exit 1 + exit $ksft_skip fi } @@ -49,6 +52,7 @@ phc_sanity # Exit status to return at the end. Set in case one of the tests fails. EXIT_STATUS=0 +PASS_COUNT=0 # Per-test return value. Clear at the beginning of each test. RET=0 @@ -65,12 +69,18 @@ log_test() { local test_name=$1 + if [[ $RET -eq $ksft_skip ]]; then + printf "TEST: %-60s [SKIP]\n" "$test_name" + return 0 + fi + if [[ $RET -ne 0 ]]; then EXIT_STATUS=1 printf "TEST: %-60s [FAIL]\n" "$test_name" return 1 fi + ((PASS_COUNT++)) printf "TEST: %-60s [ OK ]\n" "$test_name" return 0 } @@ -89,34 +99,49 @@ tests_run() settime_do() { - local res + local res out - res=$(phc_ctl $DEV set 0 wait 120.5 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV set 0 wait 120.5 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 120 )) } adjtime_do() { - local res + local res out - res=$(phc_ctl $DEV set 0 adj 10 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV set 0 adj 10 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 10 )) } adjfreq_do() { - local res + local res out # Set the clock to be 1% faster - res=$(phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2> /dev/null \ - | awk '/clock time is/{print $5}' \ - | awk -F. '{print $1}') + out=$(LC_ALL=C phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2>&1) + if [[ $? -ne 0 ]]; then + if echo "$out" | grep -qi "Operation not supported"; then + return $ksft_skip + fi + return 1 + fi + res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}') (( res == 101 )) } @@ -163,4 +188,7 @@ trap cleanup EXIT tests_run +if [[ $EXIT_STATUS -eq 0 && $PASS_COUNT -eq 0 ]]; then + exit $ksft_skip +fi exit $EXIT_STATUS diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore index f6cbce77460b..b8fd42547a6e 100644 --- a/tools/testing/selftests/rcutorture/.gitignore +++ b/tools/testing/selftests/rcutorture/.gitignore @@ -3,3 +3,4 @@ initrd b[0-9]* res *.swp +.kvm.sh.lock diff --git a/tools/testing/selftests/rcutorture/bin/config2csv.sh b/tools/testing/selftests/rcutorture/bin/config2csv.sh index 0cf55f1bf654..aeab4d6f11ad 100755 --- a/tools/testing/selftests/rcutorture/bin/config2csv.sh +++ b/tools/testing/selftests/rcutorture/bin/config2csv.sh @@ -42,7 +42,7 @@ do grep -v '^#' < $i | grep -v '^ *$' > $T/p if test -r $i.boot then - tr -s ' ' '\012' < $i.boot | grep -v '^#' >> $T/p + sed -e 's/#.*$//' < $i.boot | tr -s ' ' '\012' >> $T/p fi sed -e 's/^[^=]*$/&=?/' < $T/p | sed -e 's/^\([^=]*\)=\(.*\)$/\tp["\1:'"$i"'"] = "\2";\n\tc["\1"] = 1;/' >> $T/p.awk diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index 2ff905a1853b..c4ee5f910931 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -15,7 +15,7 @@ # This script is intended to replace kvm-check-branches.sh by providing # ease of use and faster execution. -T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`" +T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"; export T trap 'rm -rf $T' 0 scriptname=$0 @@ -32,6 +32,7 @@ then echo "$0: Repetition ('*') not allowed in config list." exit 1 fi +config_list_len="`echo ${config_list} | wc -w | awk '{ print $1; }'`" commit_list="${2}" if test -z "${commit_list}" @@ -47,70 +48,209 @@ then exit 2 fi sha1_list=`cat $T/commits` +sha1_list_len="`echo ${sha1_list} | wc -w | awk '{ print $1; }'`" shift shift RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE PATH=${RCUTORTURE}/bin:$PATH; export PATH +RES="${RCUTORTURE}/res"; export RES . functions.sh ret=0 -nfail=0 +nbuildfail=0 +nrunfail=0 nsuccess=0 -faillist= +ncpus=0 +buildfaillist= +runfaillist= successlist= cursha1="`git rev-parse --abbrev-ref HEAD`" ds="`date +%Y.%m.%d-%H.%M.%S`-series" +DS="${RES}/${ds}"; export DS startdate="`date`" starttime="`get_starttime`" echo " --- " $scriptname $args | tee -a $T/log echo " --- Results directory: " $ds | tee -a $T/log +# Do all builds. Iterate through commits within a given scenario +# because builds normally go faster from one commit to the next within a +# given scenario. In contrast, switching scenarios on each rebuild will +# often force a full rebuild due to Kconfig differences, for example, +# turning preemption on and off. Defer actual runs in order to run +# lots of them concurrently on large systems. +touch $T/torunlist +n2build="$((config_list_len*sha1_list_len))" +nbuilt=0 for config in ${config_list} do sha_n=0 for sha in ${sha1_list} do sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order. - echo Starting ${config}/${sha1} at `date` | tee -a $T/log - git checkout "${sha}" - time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@" + echo + echo Starting ${config}/${sha1} "($((nbuilt+1)) of ${n2build})" at `date` | tee -a $T/log + git checkout --detach "${sha}" + tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@" curret=$? if test "${curret}" -ne 0 then - nfail=$((nfail+1)) - faillist="$faillist ${config}/${sha1}(${curret})" + nbuildfail=$((nbuildfail+1)) + buildfaillist="$buildfaillist ${config}/${sha1}(${curret})" else - nsuccess=$((nsuccess+1)) - successlist="$successlist ${config}/${sha1}" - # Successful run, so remove large files. - rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers} + batchncpus="`grep -v "^# cpus=" "${DS}/${config}/${sha1}/batches" | awk '{ sum += $3 } END { print sum }'`" + echo run_one_qemu ${sha_n} ${config}/${sha1} ${batchncpus} >> $T/torunlist + if test "${ncpus}" -eq 0 + then + ncpus="`grep "^# cpus=" "${DS}/${config}/${sha1}/batches" | sed -e 's/^# cpus=//'`" + case "${ncpus}" in + ^[0-9]*$) + ;; + *) + ncpus=0 + ;; + esac + fi fi if test "${ret}" -eq 0 then ret=${curret} fi sha_n=$((sha_n+1)) + nbuilt=$((nbuilt+1)) done done + +# If the user did not specify the number of CPUs, use them all. +if test "${ncpus}" -eq 0 +then + ncpus="`identify_qemu_vcpus`" +fi + +cpusused=0 +touch $T/successlistfile +touch $T/faillistfile +n2run="`wc -l $T/torunlist | awk '{ print $1; }'`" +nrun=0 + +# do_run_one_qemu ds resultsdir qemu_curout +# +# Start the specified qemu run and record its success or failure. +do_run_one_qemu () { + local ret + local ds="$1" + local resultsdir="$2" + local qemu_curout="$3" + + tools/testing/selftests/rcutorture/bin/kvm-again.sh "${DS}/${resultsdir}" --link inplace-force > ${qemu_curout} 2>&1 + ret=$? + if test "${ret}" -eq 0 + then + echo ${resultsdir} >> $T/successlistfile + # Successful run, so remove large files. + rm -f ${DS}/${resultsdir}/{vmlinux,bzImage,System.map,Module.symvers} + else + echo "${resultsdir}(${ret})" >> $T/faillistfile + fi +} + +# cleanup_qemu_batch batchncpus +# +# Update success and failure lists, files, and counts at the end of +# a batch. +cleanup_qemu_batch () { + local batchncpus="$1" + + echo Waiting, cpusused=${cpusused}, ncpus=${ncpus} `date` | tee -a $T/log + wait + cpusused="${batchncpus}" + nsuccessbatch="`wc -l $T/successlistfile | awk '{ print $1 }'`" + nsuccess=$((nsuccess+nsuccessbatch)) + successlist="$successlist `cat $T/successlistfile`" + rm $T/successlistfile + touch $T/successlistfile + nfailbatch="`wc -l $T/faillistfile | awk '{ print $1 }'`" + nrunfail=$((nrunfail+nfailbatch)) + runfaillist="$runfaillist `cat $T/faillistfile`" + rm $T/faillistfile + touch $T/faillistfile +} + +# run_one_qemu sha_n config/sha1 batchncpus +# +# Launch into the background the sha_n-th qemu job whose results directory +# is config/sha1 and which uses batchncpus CPUs. Once we reach a job that +# would overflow the number of available CPUs, wait for the previous jobs +# to complete and record their results. +run_one_qemu () { + local sha_n="$1" + local config_sha1="$2" + local batchncpus="$3" + local qemu_curout + + cpusused=$((cpusused+batchncpus)) + if test "${cpusused}" -gt $ncpus + then + cleanup_qemu_batch "${batchncpus}" + fi + echo Starting ${config_sha1} using ${batchncpus} CPUs "($((nrun+1)) of ${n2run})" `date` + qemu_curout="${DS}/${config_sha1}/qemu-series" + do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} & + nrun="$((nrun+1))" +} + +# Re-ordering the runs will mess up the affinity chosen at build time +# (among other things, over-using CPU 0), so suppress it. +TORTURE_NO_AFFINITY="no-affinity"; export TORTURE_NO_AFFINITY + +# Run the kernels (if any) that built correctly. +echo | tee -a $T/log # Put a blank line between build and run messages. +. $T/torunlist +cleanup_qemu_batch "${batchncpus}" + +# Get back to initial checkout/SHA-1. git checkout "${cursha1}" -echo ${nsuccess} SUCCESSES: | tee -a $T/log -echo ${successlist} | fmt | tee -a $T/log -echo | tee -a $T/log -echo ${nfail} FAILURES: | tee -a $T/log -echo ${faillist} | fmt | tee -a $T/log -if test -n "${faillist}" +# Throw away leading and trailing space characters for fmt. +successlist="`echo ${successlist} | sed -e 's/^ *//' -e 's/ *$//'`" +buildfaillist="`echo ${buildfaillist} | sed -e 's/^ *//' -e 's/ *$//'`" +runfaillist="`echo ${runfaillist} | sed -e 's/^ *//' -e 's/ *$//'`" + +# Print lists of successes, build failures, and run failures, if any. +if test "${nsuccess}" -gt 0 +then + echo | tee -a $T/log + echo ${nsuccess} SUCCESSES: | tee -a $T/log + echo ${successlist} | fmt | tee -a $T/log +fi +if test "${nbuildfail}" -gt 0 then echo | tee -a $T/log - echo Failures across commits: | tee -a $T/log - echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | + echo ${nbuildfail} BUILD FAILURES: | tee -a $T/log + echo ${buildfaillist} | fmt | tee -a $T/log +fi +if test "${nrunfail}" -gt 0 +then + echo | tee -a $T/log + echo ${nrunfail} RUN FAILURES: | tee -a $T/log + echo ${runfaillist} | fmt | tee -a $T/log +fi + +# If there were build or runtime failures, map them to commits. +if test "${nbuildfail}" -gt 0 || test "${nrunfail}" -gt 0 +then + echo | tee -a $T/log + echo Build failures across commits: | tee -a $T/log + echo ${buildfaillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' | sort | uniq -c | sort -k2n | tee -a $T/log fi + +# Print run summary. +echo | tee -a $T/log echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log -echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log -cp $T/log tools/testing/selftests/rcutorture/res/${ds} +echo Summary: Successes: ${nsuccess} " "Build Failures: ${nbuildfail} " "Runtime Failures: ${nrunfail}| tee -a $T/log +cp $T/log ${DS} exit "${ret}" diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fff15821c44c..65b04b832733 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -80,6 +80,7 @@ usage () { echo " --kasan" echo " --kconfig Kconfig-options" echo " --kcsan" + echo " --kill-previous" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --memory megabytes|nnnG" @@ -206,6 +207,9 @@ do --kcsan) TORTURE_KCONFIG_KCSAN_ARG="$debuginfo CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; + --kill-previous) + TORTURE_KILL_PREVIOUS=1 + ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="`echo "$TORTURE_KMAKE_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`" @@ -275,6 +279,42 @@ do shift done +# Prevent concurrent kvm.sh runs on the same source tree. The flock +# is automatically released when the script exits, even if killed. +TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock" + +# Terminate any processes holding the lock file, if requested. +if test -n "$TORTURE_KILL_PREVIOUS" +then + if test -e "$TORTURE_LOCK" + then + echo "Killing processes holding $TORTURE_LOCK..." + if fuser -k "$TORTURE_LOCK" >/dev/null 2>&1 + then + sleep 2 + echo "Previous kvm.sh processes killed." + else + echo "No processes were holding the lock." + fi + else + echo "No lock file exists, nothing to kill." + fi +fi + +if test -z "$dryrun" +then + # Create a file descriptor and flock it, so that when kvm.sh (and its + # children) exit, the flock is released by the kernel automatically. + exec 9>"$TORTURE_LOCK" + if ! flock -n 9 + then + echo "ERROR: Another kvm.sh instance is already running on this tree." + echo " Lock file: $TORTURE_LOCK" + echo " To run kvm.sh, kill all existing kvm.sh runs first (--kill-previous)." + exit 1 + fi +fi + if test -n "$dryrun" || test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh then : diff --git a/tools/testing/selftests/rcutorture/bin/mktestid.sh b/tools/testing/selftests/rcutorture/bin/mktestid.sh index 16f9907a4dae..24f6261dab6a 100755 --- a/tools/testing/selftests/rcutorture/bin/mktestid.sh +++ b/tools/testing/selftests/rcutorture/bin/mktestid.sh @@ -18,7 +18,7 @@ fi echo Build directory: `pwd` > ${resdir}/testid.txt if test -d .git then - echo Current commit: `git rev-parse HEAD` >> ${resdir}/testid.txt + echo Current commit: `git show --oneline --no-patch HEAD` >> ${resdir}/testid.txt echo >> ${resdir}/testid.txt echo ' ---' Output of "'"git status"'": >> ${resdir}/testid.txt git status >> ${resdir}/testid.txt diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index 85b407467454..18efab346381 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -10,5 +10,4 @@ CONFIG_PROVE_LOCKING=n #CHECK#CONFIG_PROVE_RCU=n CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y -CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index 9003c56cd764..8da390e82829 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -9,6 +9,5 @@ CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_FORCE_TASKS_TRACE_RCU=y #CHECK#CONFIG_TASKS_TRACE_RCU=y -CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y CONFIG_DEBUG_OBJECTS=y diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 94cfdba5308d..f00b622c1460 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,8 +290,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - /* AMD always supports non-contiguous CBM. */ - if (get_vendor() == ARCH_AMD) + unsigned int vendor_id = get_vendor(); + + /* AMD and Hygon always support non-contiguous CBM. */ + if (vendor_id == ARCH_AMD || vendor_id == ARCH_HYGON) return true; #if defined(__i386__) || defined(__x86_64__) /* arch */ diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index 3c51bdac2dfa..afe635b6e48d 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -23,6 +23,7 @@ #include <asm/unistd.h> #include <linux/perf_event.h> #include <linux/compiler.h> +#include <linux/bits.h> #include "kselftest.h" #define MB (1024 * 1024) @@ -36,8 +37,9 @@ * Define as bits because they're used for vendor_specific bitmask in * the struct resctrl_test. */ -#define ARCH_INTEL 1 -#define ARCH_AMD 2 +#define ARCH_INTEL BIT(0) +#define ARCH_AMD BIT(1) +#define ARCH_HYGON BIT(2) #define END_OF_TESTS 1 @@ -163,7 +165,7 @@ extern int snc_unreliable; extern char llc_occup_path[1024]; int snc_nodes_per_l3_cache(void); -int get_vendor(void); +unsigned int get_vendor(void); bool check_resctrlfs_support(void); int filter_dmesg(void); int get_domain_id(const char *resource, int cpu_no, int *domain_id); diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c index 5154ffd821c4..dbcd5eea9fbc 100644 --- a/tools/testing/selftests/resctrl/resctrl_tests.c +++ b/tools/testing/selftests/resctrl/resctrl_tests.c @@ -23,16 +23,24 @@ static struct resctrl_test *resctrl_tests[] = { &l2_noncont_cat_test, }; -static int detect_vendor(void) +static unsigned int detect_vendor(void) { - FILE *inf = fopen("/proc/cpuinfo", "r"); - int vendor_id = 0; + static unsigned int vendor_id; + static bool initialized; char *s = NULL; + FILE *inf; char *res; - if (!inf) + if (initialized) return vendor_id; + inf = fopen("/proc/cpuinfo", "r"); + if (!inf) { + vendor_id = 0; + initialized = true; + return vendor_id; + } + res = fgrep(inf, "vendor_id"); if (res) @@ -42,18 +50,22 @@ static int detect_vendor(void) vendor_id = ARCH_INTEL; else if (s && !strcmp(s, ": AuthenticAMD\n")) vendor_id = ARCH_AMD; + else if (s && !strcmp(s, ": HygonGenuine\n")) + vendor_id = ARCH_HYGON; fclose(inf); free(res); + + initialized = true; return vendor_id; } -int get_vendor(void) +unsigned int get_vendor(void) { - static int vendor = -1; + unsigned int vendor; + + vendor = detect_vendor(); - if (vendor == -1) - vendor = detect_vendor(); if (vendor == 0) ksft_print_msg("Can not get vendor info...\n"); diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c index 195f04c4d158..b9c1bfb6cc02 100644 --- a/tools/testing/selftests/resctrl/resctrlfs.c +++ b/tools/testing/selftests/resctrl/resctrlfs.c @@ -243,6 +243,16 @@ int snc_nodes_per_l3_cache(void) } snc_mode = cache_cpus / node_cpus; + /* + * On some platforms (e.g. Hygon), + * cache_cpus < node_cpus, the calculated snc_mode is 0. + * + * Set snc_mode = 1 to indicate that SNC mode is not + * supported on the platform. + */ + if (!snc_mode) + snc_mode = 1; + if (snc_mode > 1) ksft_print_msg("SNC-%d mode discovered.\n", snc_mode); } diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 099b8c1f46f8..5671b4405a12 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector +RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector cfi else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/cfi/.gitignore b/tools/testing/selftests/riscv/cfi/.gitignore new file mode 100644 index 000000000000..c1faf7ca4346 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/.gitignore @@ -0,0 +1,2 @@ +cfitests +shadowstack diff --git a/tools/testing/selftests/riscv/cfi/Makefile b/tools/testing/selftests/riscv/cfi/Makefile new file mode 100644 index 000000000000..96a4dc4b69c3 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/Makefile @@ -0,0 +1,23 @@ +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -I$(top_srcdir)/tools/include + +CFLAGS += -march=rv64gc_zicfilp_zicfiss -fcf-protection=full + +# Check for zicfi* extensions needs cross compiler +# which is not set until lib.mk is included +ifeq ($(LLVM)$(CC),cc) +CC := $(CROSS_COMPILE)gcc +endif + + +ifeq ($(shell $(CC) $(CFLAGS) -nostdlib -xc /dev/null -o /dev/null > /dev/null 2>&1; echo $$?),0) +TEST_GEN_PROGS := cfitests + +$(OUTPUT)/cfitests: cfitests.c shadowstack.c + $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ +else + +$(shell echo "Toolchain doesn't support CFI, skipping CFI kselftest." >&2) +endif + +include ../../lib.mk diff --git a/tools/testing/selftests/riscv/cfi/cfi_rv_test.h b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h new file mode 100644 index 000000000000..1c8043f2b778 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef SELFTEST_RISCV_CFI_H +#define SELFTEST_RISCV_CFI_H +#include <stddef.h> +#include <sys/types.h> +#include "shadowstack.h" + +#define CHILD_EXIT_CODE_SSWRITE 10 +#define CHILD_EXIT_CODE_SIG_TEST 11 + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + register long _arg4 __asm__ ("a3") = (long)(arg4); \ + register long _arg5 __asm__ ("a4") = (long)(arg5); \ + \ + __asm__ volatile( \ + "ecall\n" \ + : "+r" \ + (_arg1) \ + : "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num __asm__ ("a7") = (num); \ + register long _arg1 __asm__ ("a0") = (long)(arg1); \ + register long _arg2 __asm__ ("a1") = (long)(arg2); \ + register long _arg3 __asm__ ("a2") = (long)(arg3); \ + \ + __asm__ volatile( \ + "ecall\n" \ + : "+r" (_arg1) \ + : "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +#define CSR_SSP 0x011 + +#ifdef __ASSEMBLY__ +#define __ASM_STR(x) x +#else +#define __ASM_STR(x) #x +#endif + +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) \ + : "=r" (__v) : \ + : "memory"); \ + __v; \ +}) + +#define csr_write(csr, val) \ +({ \ + unsigned long __v = (unsigned long)(val); \ + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" \ + : : "rK" (__v) \ + : "memory"); \ +}) + +#endif diff --git a/tools/testing/selftests/riscv/cfi/cfitests.c b/tools/testing/selftests/riscv/cfi/cfitests.c new file mode 100644 index 000000000000..298544854415 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/cfitests.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "../../kselftest.h" +#include <sys/signal.h> +#include <asm/ucontext.h> +#include <linux/prctl.h> +#include <errno.h> +#include <linux/ptrace.h> +#include <sys/wait.h> +#include <linux/elf.h> +#include <sys/uio.h> +#include <asm-generic/unistd.h> + +#include "cfi_rv_test.h" + +/* do not optimize cfi related test functions */ +#pragma GCC push_options +#pragma GCC optimize("O0") + +void sigsegv_handler(int signum, siginfo_t *si, void *uc) +{ + struct ucontext *ctx = (struct ucontext *)uc; + + if (si->si_code == SEGV_CPERR) { + ksft_print_msg("Control flow violation happened somewhere\n"); + ksft_print_msg("PC where violation happened %lx\n", ctx->uc_mcontext.gregs[0]); + exit(-1); + } + + /* all other cases are expected to be of shadow stack write case */ + exit(CHILD_EXIT_CODE_SSWRITE); +} + +bool register_signal_handler(void) +{ + struct sigaction sa = {}; + + sa.sa_sigaction = sigsegv_handler; + sa.sa_flags = SA_SIGINFO; + if (sigaction(SIGSEGV, &sa, NULL)) { + ksft_print_msg("Registering signal handler for landing pad violation failed\n"); + return false; + } + + return true; +} + +long ptrace(int request, pid_t pid, void *addr, void *data); + +bool cfi_ptrace_test(void) +{ + pid_t pid; + int status, ret = 0; + unsigned long ptrace_test_num = 0, total_ptrace_tests = 2; + + struct user_cfi_state cfi_reg; + struct iovec iov; + + pid = fork(); + + if (pid == -1) { + ksft_exit_fail_msg("%s: fork failed\n", __func__); + exit(1); + } + + if (pid == 0) { + /* allow to be traced */ + ptrace(PTRACE_TRACEME, 0, NULL, NULL); + raise(SIGSTOP); + asm volatile ("la a5, 1f\n" + "jalr a5\n" + "nop\n" + "nop\n" + "1: nop\n" + : : : "a5"); + exit(11); + /* child shouldn't go beyond here */ + } + + /* parent's code goes here */ + iov.iov_base = &cfi_reg; + iov.iov_len = sizeof(cfi_reg); + + while (ptrace_test_num < total_ptrace_tests) { + memset(&cfi_reg, 0, sizeof(cfi_reg)); + waitpid(pid, &status, 0); + if (WIFSTOPPED(status)) { + errno = 0; + ret = ptrace(PTRACE_GETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov); + if (ret == -1 && errno) + ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__); + } else { + ksft_exit_fail_msg("%s: child didn't stop, failed\n", __func__); + } + + switch (ptrace_test_num) { +#define CFI_ENABLE_MASK (PTRACE_CFI_LP_EN_STATE | \ + PTRACE_CFI_SS_EN_STATE | \ + PTRACE_CFI_SS_PTR_STATE) + case 0: + if ((cfi_reg.cfi_status.cfi_state & CFI_ENABLE_MASK) != CFI_ENABLE_MASK) + ksft_exit_fail_msg("%s: ptrace_getregset failed, %llu\n", __func__, + cfi_reg.cfi_status.cfi_state); + if (!cfi_reg.shstk_ptr) + ksft_exit_fail_msg("%s: NULL shadow stack pointer, test failed\n", + __func__); + break; + case 1: + if (!(cfi_reg.cfi_status.cfi_state & PTRACE_CFI_ELP_STATE)) + ksft_exit_fail_msg("%s: elp must have been set\n", __func__); + /* clear elp state. not interested in anything else */ + cfi_reg.cfi_status.cfi_state = 0; + + ret = ptrace(PTRACE_SETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov); + if (ret == -1 && errno) + ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__); + break; + default: + ksft_exit_fail_msg("%s: unreachable switch case\n", __func__); + break; + } + ptrace(PTRACE_CONT, pid, NULL, NULL); + ptrace_test_num++; + } + + waitpid(pid, &status, 0); + if (WEXITSTATUS(status) != 11) + ksft_print_msg("%s, bad return code from child\n", __func__); + + ksft_print_msg("%s, ptrace test succeeded\n", __func__); + return true; +} + +int main(int argc, char *argv[]) +{ + int ret = 0; + unsigned long lpad_status = 0, ss_status = 0; + + ksft_print_header(); + + ksft_print_msg("Starting risc-v tests\n"); + + /* + * Landing pad test. Not a lot of kernel changes to support landing + * pads for user mode except lighting up a bit in senvcfg via a prctl. + * Enable landing pad support throughout the execution of the test binary. + */ + ret = my_syscall5(__NR_prctl, PR_GET_INDIR_BR_LP_STATUS, &lpad_status, 0, 0, 0); + if (ret) + ksft_exit_fail_msg("Get landing pad status failed with %d\n", ret); + + if (!(lpad_status & PR_INDIR_BR_LP_ENABLE)) + ksft_exit_fail_msg("Landing pad is not enabled, should be enabled via glibc\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) + ksft_exit_fail_msg("Get shadow stack failed with %d\n", ret); + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_exit_fail_msg("Shadow stack is not enabled, should be enabled via glibc\n"); + + if (!register_signal_handler()) + ksft_exit_fail_msg("Registering signal handler for SIGSEGV failed\n"); + + ksft_print_msg("Landing pad and shadow stack are enabled for binary\n"); + cfi_ptrace_test(); + + execute_shadow_stack_tests(); + + return 0; +} + +#pragma GCC pop_options diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.c b/tools/testing/selftests/riscv/cfi/shadowstack.c new file mode 100644 index 000000000000..f8eed8260a12 --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/shadowstack.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "../../kselftest.h" +#include <sys/wait.h> +#include <signal.h> +#include <fcntl.h> +#include <asm-generic/unistd.h> +#include <sys/mman.h> +#include "shadowstack.h" +#include "cfi_rv_test.h" + +static struct shadow_stack_tests shstk_tests[] = { + { "shstk fork test\n", shadow_stack_fork_test }, + { "map shadow stack syscall\n", shadow_stack_map_test }, + { "shadow stack gup tests\n", shadow_stack_gup_tests }, + { "shadow stack signal tests\n", shadow_stack_signal_test}, + { "memory protections of shadow stack memory\n", shadow_stack_protection_test } +}; + +#define RISCV_SHADOW_STACK_TESTS ARRAY_SIZE(shstk_tests) + +/* do not optimize shadow stack related test functions */ +#pragma GCC push_options +#pragma GCC optimize("O0") + +void zar(void) +{ + unsigned long ssp = 0; + + ssp = csr_read(CSR_SSP); + ksft_print_msg("Spewing out shadow stack ptr: %lx\n" + " This is to ensure shadow stack is indeed enabled and working\n", + ssp); +} + +void bar(void) +{ + zar(); +} + +void foo(void) +{ + bar(); +} + +void zar_child(void) +{ + unsigned long ssp = 0; + + ssp = csr_read(CSR_SSP); + ksft_print_msg("Spewing out shadow stack ptr: %lx\n" + " This is to ensure shadow stack is indeed enabled and working\n", + ssp); +} + +void bar_child(void) +{ + zar_child(); +} + +void foo_child(void) +{ + bar_child(); +} + +typedef void (call_func_ptr)(void); +/* + * call couple of functions to test push/pop. + */ +int shadow_stack_call_tests(call_func_ptr fn_ptr, bool parent) +{ + ksft_print_msg("dummy calls for sspush and sspopchk in context of %s\n", + parent ? "parent" : "child"); + + (fn_ptr)(); + + return 0; +} + +/* forks a thread, and ensure shadow stacks fork out */ +bool shadow_stack_fork_test(unsigned long test_num, void *ctx) +{ + int pid = 0, child_status = 0, parent_pid = 0, ret = 0; + unsigned long ss_status = 0; + + ksft_print_msg("Exercising shadow stack fork test\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) { + ksft_exit_skip("Shadow stack get status prctl failed with errorcode %d\n", ret); + return false; + } + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_exit_skip("Shadow stack is not enabled, should be enabled via glibc\n"); + + parent_pid = getpid(); + pid = fork(); + + if (pid) { + ksft_print_msg("Parent pid %d and child pid %d\n", parent_pid, pid); + shadow_stack_call_tests(&foo, true); + } else { + shadow_stack_call_tests(&foo_child, false); + } + + if (pid) { + ksft_print_msg("Waiting on child to finish\n"); + wait(&child_status); + } else { + /* exit child gracefully */ + exit(0); + } + + if (pid && WIFSIGNALED(child_status)) { + ksft_print_msg("Child faulted, fork test failed\n"); + return false; + } + + return true; +} + +/* exercise 'map_shadow_stack', pivot to it and call some functions to ensure it works */ +#define SHADOW_STACK_ALLOC_SIZE 4096 +bool shadow_stack_map_test(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr; + int ret = 0; + + ksft_print_msg("Exercising shadow stack map test\n"); + + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", + (int)shdw_addr); + return false; + } + + ret = munmap((void *)shdw_addr, SHADOW_STACK_ALLOC_SIZE); + + if (ret) { + ksft_print_msg("munmap failed with error code %d\n", ret); + return false; + } + + return true; +} + +/* + * shadow stack protection tests. map a shadow stack and + * validate all memory protections work on it + */ +bool shadow_stack_protection_test(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr; + unsigned long *write_addr = NULL; + int ret = 0, pid = 0, child_status = 0; + + ksft_print_msg("Exercising shadow stack protection test (WPT)\n"); + + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", + (int)shdw_addr); + return false; + } + + write_addr = (unsigned long *)shdw_addr; + pid = fork(); + + /* no child was created, return false */ + if (pid == -1) + return false; + + /* + * try to perform a store from child on shadow stack memory + * it should result in SIGSEGV + */ + if (!pid) { + /* below write must lead to SIGSEGV */ + *write_addr = 0xdeadbeef; + } else { + wait(&child_status); + } + + /* test fail, if 0xdeadbeef present on shadow stack address */ + if (*write_addr == 0xdeadbeef) { + ksft_print_msg("Shadow stack WPT failed\n"); + return false; + } + + /* if child reached here, then fail */ + if (!pid) { + ksft_print_msg("Shadow stack WPT failed: child reached unreachable state\n"); + return false; + } + + /* if child exited via signal handler but not for write on ss */ + if (WIFEXITED(child_status) && + WEXITSTATUS(child_status) != CHILD_EXIT_CODE_SSWRITE) { + ksft_print_msg("Shadow stack WPT failed: child wasn't signaled for write\n"); + return false; + } + + ret = munmap(write_addr, SHADOW_STACK_ALLOC_SIZE); + if (ret) { + ksft_print_msg("Shadow stack WPT failed: munmap failed, error code %d\n", + ret); + return false; + } + + return true; +} + +#define SS_MAGIC_WRITE_VAL 0xbeefdead + +int gup_tests(int mem_fd, unsigned long *shdw_addr) +{ + unsigned long val = 0; + + lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET); + if (read(mem_fd, &val, sizeof(val)) < 0) { + ksft_print_msg("Reading shadow stack mem via gup failed\n"); + return 1; + } + + val = SS_MAGIC_WRITE_VAL; + lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET); + if (write(mem_fd, &val, sizeof(val)) < 0) { + ksft_print_msg("Writing shadow stack mem via gup failed\n"); + return 1; + } + + if (*shdw_addr != SS_MAGIC_WRITE_VAL) { + ksft_print_msg("GUP write to shadow stack memory failed\n"); + return 1; + } + + return 0; +} + +bool shadow_stack_gup_tests(unsigned long test_num, void *ctx) +{ + unsigned long shdw_addr = 0; + unsigned long *write_addr = NULL; + int fd = 0; + bool ret = false; + + ksft_print_msg("Exercising shadow stack gup tests\n"); + shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0); + + if (((long)shdw_addr) <= 0) { + ksft_print_msg("map_shadow_stack failed with error code %d\n", (int)shdw_addr); + return false; + } + + write_addr = (unsigned long *)shdw_addr; + + fd = open("/proc/self/mem", O_RDWR); + if (fd == -1) + return false; + + if (gup_tests(fd, write_addr)) { + ksft_print_msg("gup tests failed\n"); + goto out; + } + + ret = true; +out: + if (shdw_addr && munmap(write_addr, SHADOW_STACK_ALLOC_SIZE)) { + ksft_print_msg("munmap failed with error code %d\n", ret); + ret = false; + } + + return ret; +} + +volatile bool break_loop; + +void sigusr1_handler(int signo) +{ + break_loop = true; +} + +bool sigusr1_signal_test(void) +{ + struct sigaction sa = {}; + + sa.sa_handler = sigusr1_handler; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL)) { + ksft_print_msg("Registering signal handler for SIGUSR1 failed\n"); + return false; + } + + return true; +} + +/* + * shadow stack signal test. shadow stack must be enabled. + * register a signal, fork another thread which is waiting + * on signal. Send a signal from parent to child, verify + * that signal was received by child. If not test fails + */ +bool shadow_stack_signal_test(unsigned long test_num, void *ctx) +{ + int pid = 0, child_status = 0, ret = 0; + unsigned long ss_status = 0; + + ksft_print_msg("Exercising shadow stack signal test\n"); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0); + if (ret) { + ksft_print_msg("Shadow stack get status prctl failed with errorcode %d\n", ret); + return false; + } + + if (!(ss_status & PR_SHADOW_STACK_ENABLE)) + ksft_print_msg("Shadow stack is not enabled, should be enabled via glibc\n"); + + /* this should be caught by signal handler and do an exit */ + if (!sigusr1_signal_test()) { + ksft_print_msg("Registering sigusr1 handler failed\n"); + exit(-1); + } + + pid = fork(); + + if (pid == -1) { + ksft_print_msg("Signal test: fork failed\n"); + goto out; + } + + if (pid == 0) { + while (!break_loop) + sleep(1); + + exit(11); + /* child shouldn't go beyond here */ + } + + /* send SIGUSR1 to child */ + kill(pid, SIGUSR1); + wait(&child_status); + +out: + + return (WIFEXITED(child_status) && + WEXITSTATUS(child_status) == 11); +} + +int execute_shadow_stack_tests(void) +{ + int ret = 0; + unsigned long test_count = 0; + unsigned long shstk_status = 0; + bool test_pass = false; + + ksft_print_msg("Executing RISC-V shadow stack self tests\n"); + ksft_set_plan(RISCV_SHADOW_STACK_TESTS); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &shstk_status, 0, 0, 0); + + if (ret != 0) + ksft_exit_fail_msg("Get shadow stack status failed with %d\n", ret); + + /* + * If we are here that means get shadow stack status succeeded and + * thus shadow stack support is baked in the kernel. + */ + while (test_count < RISCV_SHADOW_STACK_TESTS) { + test_pass = (*shstk_tests[test_count].t_func)(test_count, NULL); + ksft_test_result(test_pass, shstk_tests[test_count].name); + test_count++; + } + + ksft_finished(); + + return 0; +} + +#pragma GCC pop_options diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.h b/tools/testing/selftests/riscv/cfi/shadowstack.h new file mode 100644 index 000000000000..943a3685905f --- /dev/null +++ b/tools/testing/selftests/riscv/cfi/shadowstack.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef SELFTEST_SHADOWSTACK_TEST_H +#define SELFTEST_SHADOWSTACK_TEST_H +#include <stddef.h> +#include <linux/prctl.h> + +/* + * A CFI test returns true for success or false for fail. + * Takes a test number to index into array, and a void pointer. + */ +typedef bool (*shstk_test_func)(unsigned long test_num, void *); + +struct shadow_stack_tests { + char *name; + shstk_test_func t_func; +}; + +bool shadow_stack_fork_test(unsigned long test_num, void *ctx); +bool shadow_stack_map_test(unsigned long test_num, void *ctx); +bool shadow_stack_protection_test(unsigned long test_num, void *ctx); +bool shadow_stack_gup_tests(unsigned long test_num, void *ctx); +bool shadow_stack_signal_test(unsigned long test_num, void *ctx); + +int execute_shadow_stack_tests(void); + +#endif diff --git a/tools/testing/selftests/riscv/hwprobe/which-cpus.c b/tools/testing/selftests/riscv/hwprobe/which-cpus.c index 3ab53067e8dd..587feb198c04 100644 --- a/tools/testing/selftests/riscv/hwprobe/which-cpus.c +++ b/tools/testing/selftests/riscv/hwprobe/which-cpus.c @@ -83,9 +83,9 @@ static void do_which_cpus(int argc, char **argv, cpu_set_t *cpus) int main(int argc, char **argv) { - struct riscv_hwprobe pairs[2]; + struct riscv_hwprobe pairs[3]; cpu_set_t cpus_aff, cpus; - __u64 ext0_all; + __u64 ext0_all, ext1_all; long rc; rc = sched_getaffinity(0, sizeof(cpu_set_t), &cpus_aff); @@ -112,6 +112,11 @@ int main(int argc, char **argv) assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_0); ext0_all = pairs[0].value; + pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, }; + rc = riscv_hwprobe(pairs, 1, 0, NULL, 0); + assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_1); + ext1_all = pairs[0].value; + pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; CPU_ZERO(&cpus); rc = riscv_hwprobe(pairs, 1, 0, (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); @@ -134,20 +139,23 @@ int main(int argc, char **argv) pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, }; CPU_ZERO(&cpus); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == sysconf(_SC_NPROCESSORS_ONLN), "set all cpus\n"); pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, }; memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t)); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_EQUAL(&cpus, &cpus_aff), "set all affinity cpus\n"); pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, }; pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ~ext0_all, }; + pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ~ext1_all, }; memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t)); - rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); + rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS); ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == 0, "clear all cpus\n"); ksft_finished(); diff --git a/tools/testing/selftests/riscv/vector/.gitignore b/tools/testing/selftests/riscv/vector/.gitignore index 7d9c87cd0649..40a82baf364f 100644 --- a/tools/testing/selftests/riscv/vector/.gitignore +++ b/tools/testing/selftests/riscv/vector/.gitignore @@ -2,3 +2,5 @@ vstate_exec_nolibc vstate_prctl v_initval v_exec_initval_nolibc +vstate_ptrace +validate_v_ptrace diff --git a/tools/testing/selftests/riscv/vector/Makefile b/tools/testing/selftests/riscv/vector/Makefile index 2c2a33fc083e..326dafd739bf 100644 --- a/tools/testing/selftests/riscv/vector/Makefile +++ b/tools/testing/selftests/riscv/vector/Makefile @@ -2,11 +2,14 @@ # Copyright (C) 2021 ARM Limited # Originally tools/testing/arm64/abi/Makefile -TEST_GEN_PROGS := v_initval vstate_prctl vstate_ptrace +TEST_GEN_PROGS := v_initval vstate_prctl vstate_ptrace validate_v_ptrace TEST_GEN_PROGS_EXTENDED := vstate_exec_nolibc v_exec_initval_nolibc +TEST_GEN_LIBS := v_helpers.c sys_hwprobe.c include ../../lib.mk +TEST_GEN_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(TEST_GEN_LIBS)) + $(OUTPUT)/sys_hwprobe.o: ../hwprobe/sys_hwprobe.S $(CC) -static -c -o$@ $(CFLAGS) $^ @@ -29,3 +32,8 @@ $(OUTPUT)/v_exec_initval_nolibc: v_exec_initval_nolibc.c $(OUTPUT)/vstate_ptrace: vstate_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + +$(OUTPUT)/validate_v_ptrace: validate_v_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o + $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + +EXTRA_CLEAN += $(TEST_GEN_OBJ) diff --git a/tools/testing/selftests/riscv/vector/v_helpers.c b/tools/testing/selftests/riscv/vector/v_helpers.c index 01a8799dcb78..de6da7c8d2f1 100644 --- a/tools/testing/selftests/riscv/vector/v_helpers.c +++ b/tools/testing/selftests/riscv/vector/v_helpers.c @@ -26,6 +26,29 @@ bool is_vector_supported(void) return pair.value & RISCV_HWPROBE_EXT_ZVE32X; } +unsigned long get_vr_len(void) +{ + unsigned long vlenb; + + if (is_vector_supported()) { + asm volatile("csrr %[vlenb], vlenb" : [vlenb] "=r"(vlenb)); + return vlenb; + } + + if (is_xtheadvector_supported()) { + asm volatile ( + // 0 | zimm[10:0] | rs1 | 1 1 1 | rd | 1010111 | vsetvli + // vsetvli t4, x0, e8, m1, d1 + ".4byte 0b00000000000000000111111011010111\n\t" + "mv %[vlenb], t4\n\t" + : [vlenb] "=r"(vlenb) : : "memory", "t4"); + return vlenb; + } + + printf("WARNING: vector not supported\n"); + return 0; +} + int launch_test(char *next_program, int test_inherit, int xtheadvector) { char *exec_argv[4], *exec_envp[1]; diff --git a/tools/testing/selftests/riscv/vector/v_helpers.h b/tools/testing/selftests/riscv/vector/v_helpers.h index 763cddfe26da..c538077f1195 100644 --- a/tools/testing/selftests/riscv/vector/v_helpers.h +++ b/tools/testing/selftests/riscv/vector/v_helpers.h @@ -5,4 +5,6 @@ bool is_xtheadvector_supported(void); bool is_vector_supported(void); +unsigned long get_vr_len(void); + int launch_test(char *next_program, int test_inherit, int xtheadvector); diff --git a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c new file mode 100644 index 000000000000..3589549f7228 --- /dev/null +++ b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c @@ -0,0 +1,915 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/uio.h> +#include <unistd.h> +#include <errno.h> + +#include <linux/ptrace.h> +#include <linux/elf.h> + +#include "kselftest_harness.h" +#include "v_helpers.h" + +#define SR_FS_DIRTY 0x00006000UL +#define CSR_VXRM_SHIFT 1 + +volatile unsigned long chld_lock; + +TEST(ptrace_v_not_enabled) +{ + pid_t pid; + + if (!(is_vector_supported() || is_xtheadvector_supported())) + SKIP(return, "Vector not supported"); + + chld_lock = 1; + pid = fork(); + ASSERT_LE(0, pid) + TH_LOG("fork: %m"); + + if (pid == 0) { + while (chld_lock == 1) + asm volatile("" : : "g"(chld_lock) : "memory"); + + asm volatile ("ebreak" : : : ); + } else { + struct __riscv_v_regset_state *regset_data; + unsigned long vlenb = get_vr_len(); + size_t regset_size; + struct iovec iov; + int status; + int ret; + + /* attach */ + + ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* unlock */ + + ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0)); + + /* resume and wait for ebreak */ + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* try to read vector registers from the tracee */ + + regset_size = sizeof(*regset_data) + vlenb * 32; + regset_data = calloc(1, regset_size); + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + /* V extension is available, but not yet enabled for the tracee */ + + errno = 0; + ret = ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov); + ASSERT_EQ(ENODATA, errno); + ASSERT_EQ(-1, ret); + + /* cleanup */ + + ASSERT_EQ(0, kill(pid, SIGKILL)); + } +} + +TEST(ptrace_v_early_debug) +{ + static volatile unsigned long vstart; + static volatile unsigned long vtype; + static volatile unsigned long vlenb; + static volatile unsigned long vcsr; + static volatile unsigned long vl; + bool xtheadvector; + pid_t pid; + + if (!(is_vector_supported() || is_xtheadvector_supported())) + SKIP(return, "Vector not supported"); + + xtheadvector = is_xtheadvector_supported(); + + chld_lock = 1; + pid = fork(); + ASSERT_LE(0, pid) + TH_LOG("fork: %m"); + + if (pid == 0) { + unsigned long vxsat, vxrm; + + vlenb = get_vr_len(); + + while (chld_lock == 1) + asm volatile ("" : : "g"(chld_lock) : "memory"); + + asm volatile ( + "csrr %[vstart], vstart\n" + "csrr %[vtype], vtype\n" + "csrr %[vl], vl\n" + : [vtype] "=r"(vtype), [vstart] "=r"(vstart), [vl] "=r"(vl) + : + : "memory"); + + /* no 'is_xtheadvector_supported()' here to avoid clobbering v-state by syscall */ + if (xtheadvector) { + asm volatile ( + "csrs sstatus, %[bit]\n" + "csrr %[vxsat], vxsat\n" + "csrr %[vxrm], vxrm\n" + : [vxsat] "=r"(vxsat), [vxrm] "=r"(vxrm) + : [bit] "r" (SR_FS_DIRTY) + : "memory"); + vcsr = vxsat | vxrm << CSR_VXRM_SHIFT; + } else { + asm volatile ( + "csrr %[vcsr], vcsr\n" + : [vcsr] "=r"(vcsr) + : + : "memory"); + } + + asm volatile ( + ".option push\n" + ".option norvc\n" + "ebreak\n" + ".option pop\n"); + } else { + struct __riscv_v_regset_state *regset_data; + unsigned long vstart_csr; + unsigned long vlenb_csr; + unsigned long vtype_csr; + unsigned long vcsr_csr; + unsigned long vl_csr; + size_t regset_size; + struct iovec iov; + int status; + + /* attach */ + + ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* unlock */ + + ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0)); + + /* resume and wait for ebreak */ + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vector csr regs using ptrace PEEKDATA */ + + errno = 0; + vstart_csr = ptrace(PTRACE_PEEKDATA, pid, &vstart, NULL); + ASSERT_FALSE((errno != 0) && (vstart_csr == -1)); + + errno = 0; + vl_csr = ptrace(PTRACE_PEEKDATA, pid, &vl, NULL); + ASSERT_FALSE((errno != 0) && (vl_csr == -1)); + + errno = 0; + vtype_csr = ptrace(PTRACE_PEEKDATA, pid, &vtype, NULL); + ASSERT_FALSE((errno != 0) && (vtype_csr == -1)); + + errno = 0; + vcsr_csr = ptrace(PTRACE_PEEKDATA, pid, &vcsr, NULL); + ASSERT_FALSE((errno != 0) && (vcsr_csr == -1)); + + errno = 0; + vlenb_csr = ptrace(PTRACE_PEEKDATA, pid, &vlenb, NULL); + ASSERT_FALSE((errno != 0) && (vlenb_csr == -1)); + + /* read tracee csr regs using ptrace GETREGSET */ + + regset_size = sizeof(*regset_data) + vlenb_csr * 32; + regset_data = calloc(1, regset_size); + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* compare */ + + EXPECT_EQ(vstart_csr, regset_data->vstart); + EXPECT_EQ(vtype_csr, regset_data->vtype); + EXPECT_EQ(vlenb_csr, regset_data->vlenb); + EXPECT_EQ(vcsr_csr, regset_data->vcsr); + EXPECT_EQ(vl_csr, regset_data->vl); + + /* cleanup */ + + ASSERT_EQ(0, kill(pid, SIGKILL)); + } +} + +TEST(ptrace_v_syscall_clobbering) +{ + pid_t pid; + + if (!is_vector_supported() && !is_xtheadvector_supported()) + SKIP(return, "Vector not supported"); + + chld_lock = 1; + pid = fork(); + ASSERT_LE(0, pid) + TH_LOG("fork: %m"); + + if (pid == 0) { + unsigned long vl; + + while (chld_lock == 1) + asm volatile("" : : "g"(chld_lock) : "memory"); + + if (is_xtheadvector_supported()) { + asm volatile ( + // 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli + // vsetvli t4, x0, e16, m2, d1 + ".4byte 0b00000000010100000111111011010111\n" + "mv %[new_vl], t4\n" + : [new_vl] "=r" (vl) : : "t4"); + } else { + asm volatile ( + ".option push\n" + ".option arch, +zve32x\n" + "vsetvli %[new_vl], x0, e16, m2, tu, mu\n" + ".option pop\n" + : [new_vl] "=r"(vl) : : ); + } + + while (1) { + asm volatile ( + ".option push\n" + ".option norvc\n" + "ebreak\n" + ".option pop\n"); + + sleep(0); + } + } else { + struct __riscv_v_regset_state *regset_data; + unsigned long vlenb = get_vr_len(); + struct user_regs_struct regs; + size_t regset_size; + struct iovec iov; + int status; + + /* attach */ + + ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* unlock */ + + ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0)); + + /* resume and wait for the 1st ebreak */ + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vector csr regs using ptrace GETREGSET */ + + regset_size = sizeof(*regset_data) + vlenb * 32; + regset_data = calloc(1, regset_size); + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* verify initial vsetvli settings */ + + if (is_xtheadvector_supported()) + EXPECT_EQ(5UL, regset_data->vtype); + else + EXPECT_EQ(9UL, regset_data->vtype); + + EXPECT_EQ(regset_data->vlenb, regset_data->vl); + EXPECT_EQ(vlenb, regset_data->vlenb); + EXPECT_EQ(0UL, regset_data->vstart); + EXPECT_EQ(0UL, regset_data->vcsr); + + /* skip 1st ebreak, then resume and wait for the 2nd ebreak */ + + iov.iov_base = ®s; + iov.iov_len = sizeof(regs); + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov)); + regs.pc += 4; + ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov)); + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vtype using ptrace GETREGSET */ + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* verify that V state is illegal after syscall */ + + EXPECT_EQ((1UL << (__riscv_xlen - 1)), regset_data->vtype); + EXPECT_EQ(vlenb, regset_data->vlenb); + EXPECT_EQ(0UL, regset_data->vstart); + EXPECT_EQ(0UL, regset_data->vcsr); + EXPECT_EQ(0UL, regset_data->vl); + + /* cleanup */ + + ASSERT_EQ(0, kill(pid, SIGKILL)); + } +} + +FIXTURE(v_csr_invalid) +{ +}; + +FIXTURE_SETUP(v_csr_invalid) +{ +} + +FIXTURE_TEARDOWN(v_csr_invalid) +{ +} + +#define VECTOR_1_0 BIT(0) +#define XTHEAD_VECTOR_0_7 BIT(1) + +#define vector_test(x) ((x) & VECTOR_1_0) +#define xthead_test(x) ((x) & XTHEAD_VECTOR_0_7) + +/* modifications of the initial vsetvli settings */ +FIXTURE_VARIANT(v_csr_invalid) +{ + unsigned long vstart; + unsigned long vl; + unsigned long vtype; + unsigned long vcsr; + unsigned long vlenb_mul; + unsigned long vlenb_min; + unsigned long vlenb_max; + unsigned long spec; +}; + +/* unexpected vlenb value */ +FIXTURE_VARIANT_ADD(v_csr_invalid, new_vlenb) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x3, + .vcsr = 0x0, + .vlenb_mul = 0x2, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7, +}; + +/* invalid reserved bits in vcsr */ +FIXTURE_VARIANT_ADD(v_csr_invalid, vcsr_invalid_reserved_bits) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x3, + .vcsr = 0x1UL << 8, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7, +}; + +/* invalid reserved bits in vtype */ +FIXTURE_VARIANT_ADD(v_csr_invalid, vtype_invalid_reserved_bits) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = (0x1UL << 8) | 0x3, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7, +}; + +/* set vill bit */ +FIXTURE_VARIANT_ADD(v_csr_invalid, invalid_vill_bit) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = (0x1UL << (__riscv_xlen - 1)) | 0x3, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7, +}; + +/* reserved vsew value: vsew > 3 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, reserved_vsew) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x4UL << 3, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = VECTOR_1_0, +}; + +/* XTheadVector: unsupported non-zero VEDIV value */ +FIXTURE_VARIANT_ADD(v_csr_invalid, reserved_vediv) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x3UL << 5, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = XTHEAD_VECTOR_0_7, +}; + +/* reserved vlmul value: vlmul == 4 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, reserved_vlmul) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x4, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x0, + .spec = VECTOR_1_0, +}; + +/* invalid fractional LMUL for VLEN <= 256: LMUL= 1/8, SEW = 64 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, frac_lmul1) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x1d, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x20, + .spec = VECTOR_1_0, +}; + +/* invalid integral LMUL for VLEN <= 16: LMUL= 2, SEW = 64 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, int_lmul1) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x19, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x2, + .spec = VECTOR_1_0, +}; + +/* XTheadVector: invalid integral LMUL for VLEN <= 16: LMUL= 2, SEW = 64 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, int_lmul2) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0xd, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x2, + .spec = XTHEAD_VECTOR_0_7, +}; + +/* invalid VL for VLEN <= 128: LMUL= 2, SEW = 64, VL = 8 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, vl1) +{ + .vstart = 0x0, + .vl = 0x8, + .vtype = 0x19, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x10, + .spec = VECTOR_1_0, +}; + +/* XTheadVector: invalid VL for VLEN <= 128: LMUL= 2, SEW = 64, VL = 8 */ +FIXTURE_VARIANT_ADD(v_csr_invalid, vl2) +{ + .vstart = 0x0, + .vl = 0x8, + .vtype = 0xd, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x0, + .vlenb_max = 0x10, + .spec = XTHEAD_VECTOR_0_7, +}; + +TEST_F(v_csr_invalid, ptrace_v_invalid_values) +{ + unsigned long vlenb; + pid_t pid; + + if (!is_vector_supported() && !is_xtheadvector_supported()) + SKIP(return, "Vectors not supported"); + + if (is_vector_supported() && !vector_test(variant->spec)) + SKIP(return, "Test not supported for Vector"); + + if (is_xtheadvector_supported() && !xthead_test(variant->spec)) + SKIP(return, "Test not supported for XTheadVector"); + + vlenb = get_vr_len(); + + if (variant->vlenb_min) { + if (vlenb < variant->vlenb_min) + SKIP(return, "This test does not support VLEN < %lu\n", + variant->vlenb_min * 8); + } + + if (variant->vlenb_max) { + if (vlenb > variant->vlenb_max) + SKIP(return, "This test does not support VLEN > %lu\n", + variant->vlenb_max * 8); + } + + chld_lock = 1; + pid = fork(); + ASSERT_LE(0, pid) + TH_LOG("fork: %m"); + + if (pid == 0) { + unsigned long vl; + + while (chld_lock == 1) + asm volatile("" : : "g"(chld_lock) : "memory"); + + if (is_xtheadvector_supported()) { + asm volatile ( + // 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli + // vsetvli t4, x0, e16, m2, d1 + ".4byte 0b00000000010100000111111011010111\n" + "mv %[new_vl], t4\n" + : [new_vl] "=r" (vl) : : "t4"); + } else { + asm volatile ( + ".option push\n" + ".option arch, +zve32x\n" + "vsetvli %[new_vl], x0, e16, m2, tu, mu\n" + ".option pop\n" + : [new_vl] "=r"(vl) : : ); + } + + while (1) { + asm volatile ( + ".option push\n" + ".option norvc\n" + "ebreak\n" + "nop\n" + ".option pop\n"); + } + } else { + struct __riscv_v_regset_state *regset_data; + size_t regset_size; + struct iovec iov; + int status; + int ret; + + /* attach */ + + ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* unlock */ + + ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0)); + + /* resume and wait for the 1st ebreak */ + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vector csr regs using ptrace GETREGSET */ + + regset_size = sizeof(*regset_data) + vlenb * 32; + regset_data = calloc(1, regset_size); + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* verify initial vsetvli settings */ + + if (is_xtheadvector_supported()) + EXPECT_EQ(5UL, regset_data->vtype); + else + EXPECT_EQ(9UL, regset_data->vtype); + + EXPECT_EQ(regset_data->vlenb, regset_data->vl); + EXPECT_EQ(vlenb, regset_data->vlenb); + EXPECT_EQ(0UL, regset_data->vstart); + EXPECT_EQ(0UL, regset_data->vcsr); + + /* apply invalid settings from fixture variants */ + + regset_data->vlenb *= variant->vlenb_mul; + regset_data->vstart = variant->vstart; + regset_data->vtype = variant->vtype; + regset_data->vcsr = variant->vcsr; + regset_data->vl = variant->vl; + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + errno = 0; + ret = ptrace(PTRACE_SETREGSET, pid, NT_RISCV_VECTOR, &iov); + ASSERT_EQ(errno, EINVAL); + ASSERT_EQ(ret, -1); + + /* cleanup */ + + ASSERT_EQ(0, kill(pid, SIGKILL)); + } +} + +FIXTURE(v_csr_valid) +{ +}; + +FIXTURE_SETUP(v_csr_valid) +{ +} + +FIXTURE_TEARDOWN(v_csr_valid) +{ +} + +/* modifications of the initial vsetvli settings */ +FIXTURE_VARIANT(v_csr_valid) +{ + unsigned long vstart; + unsigned long vl; + unsigned long vtype; + unsigned long vcsr; + unsigned long vlenb_mul; + unsigned long vlenb_min; + unsigned long vlenb_max; + unsigned long spec; +}; + +/* valid for VLEN >= 128: LMUL= 1/4, SEW = 32 */ +FIXTURE_VARIANT_ADD(v_csr_valid, frac_lmul1) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x16, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x10, + .vlenb_max = 0x0, + .spec = VECTOR_1_0, +}; + +/* valid for VLEN >= 16: LMUL= 2, SEW = 32 */ +FIXTURE_VARIANT_ADD(v_csr_valid, int_lmul1) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x11, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x2, + .vlenb_max = 0x0, + .spec = VECTOR_1_0, +}; + +/* valid for XTheadVector VLEN >= 16: LMUL= 2, SEW = 32 */ +FIXTURE_VARIANT_ADD(v_csr_valid, int_lmul2) +{ + .vstart = 0x0, + .vl = 0x0, + .vtype = 0x9, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x2, + .vlenb_max = 0x0, + .spec = XTHEAD_VECTOR_0_7, +}; + +/* valid for VLEN >= 32: LMUL= 2, SEW = 32, VL = 2 */ +FIXTURE_VARIANT_ADD(v_csr_valid, int_lmul3) +{ + .vstart = 0x0, + .vl = 0x2, + .vtype = 0x11, + .vcsr = 0x0, + .vlenb_mul = 0x1, + .vlenb_min = 0x4, + .vlenb_max = 0x0, + .spec = VECTOR_1_0, +}; + +TEST_F(v_csr_valid, ptrace_v_valid_values) +{ + unsigned long vlenb; + pid_t pid; + + if (!is_vector_supported() && !is_xtheadvector_supported()) + SKIP(return, "Vectors not supported"); + + if (is_vector_supported() && !vector_test(variant->spec)) + SKIP(return, "Test not supported for Vector"); + + if (is_xtheadvector_supported() && !xthead_test(variant->spec)) + SKIP(return, "Test not supported for XTheadVector"); + + vlenb = get_vr_len(); + + if (variant->vlenb_min) { + if (vlenb < variant->vlenb_min) + SKIP(return, "This test does not support VLEN < %lu\n", + variant->vlenb_min * 8); + } + if (variant->vlenb_max) { + if (vlenb > variant->vlenb_max) + SKIP(return, "This test does not support VLEN > %lu\n", + variant->vlenb_max * 8); + } + + chld_lock = 1; + pid = fork(); + ASSERT_LE(0, pid) + TH_LOG("fork: %m"); + + if (pid == 0) { + unsigned long vl; + + while (chld_lock == 1) + asm volatile("" : : "g"(chld_lock) : "memory"); + + if (is_xtheadvector_supported()) { + asm volatile ( + // 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli + // vsetvli t4, x0, e16, m2, d1 + ".4byte 0b00000000010100000111111011010111\n" + "mv %[new_vl], t4\n" + : [new_vl] "=r" (vl) : : "t4"); + } else { + asm volatile ( + ".option push\n" + ".option arch, +zve32x\n" + "vsetvli %[new_vl], x0, e16, m2, tu, mu\n" + ".option pop\n" + : [new_vl] "=r"(vl) : : ); + } + + asm volatile ( + ".option push\n" + ".option norvc\n" + ".option arch, +zve32x\n" + "ebreak\n" /* breakpoint 1: apply new V state using ptrace */ + "nop\n" + "ebreak\n" /* breakpoint 2: V state clean - context will not be saved */ + "vmv.v.i v0, -1\n" + "ebreak\n" /* breakpoint 3: V state dirty - context will be saved */ + ".option pop\n"); + } else { + struct __riscv_v_regset_state *regset_data; + struct user_regs_struct regs; + size_t regset_size; + struct iovec iov; + int status; + + /* attach */ + + ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* unlock */ + + ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0)); + + /* resume and wait for the 1st ebreak */ + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vector csr regs using ptrace GETREGSET */ + + regset_size = sizeof(*regset_data) + vlenb * 32; + regset_data = calloc(1, regset_size); + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* verify initial vsetvli settings */ + + if (is_xtheadvector_supported()) + EXPECT_EQ(5UL, regset_data->vtype); + else + EXPECT_EQ(9UL, regset_data->vtype); + + EXPECT_EQ(regset_data->vlenb, regset_data->vl); + EXPECT_EQ(vlenb, regset_data->vlenb); + EXPECT_EQ(0UL, regset_data->vstart); + EXPECT_EQ(0UL, regset_data->vcsr); + + /* apply valid settings from fixture variants */ + + regset_data->vlenb *= variant->vlenb_mul; + regset_data->vstart = variant->vstart; + regset_data->vtype = variant->vtype; + regset_data->vcsr = variant->vcsr; + regset_data->vl = variant->vl; + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* skip 1st ebreak, then resume and wait for the 2nd ebreak */ + + iov.iov_base = ®s; + iov.iov_len = sizeof(regs); + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov)); + regs.pc += 4; + ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov)); + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vector csr regs using ptrace GETREGSET */ + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* verify vector csr regs from tracee context */ + + EXPECT_EQ(regset_data->vstart, variant->vstart); + EXPECT_EQ(regset_data->vtype, variant->vtype); + EXPECT_EQ(regset_data->vcsr, variant->vcsr); + EXPECT_EQ(regset_data->vl, variant->vl); + EXPECT_EQ(regset_data->vlenb, vlenb); + + /* skip 2nd ebreak, then resume and wait for the 3rd ebreak */ + + iov.iov_base = ®s; + iov.iov_len = sizeof(regs); + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov)); + regs.pc += 4; + ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov)); + + ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL)); + ASSERT_EQ(pid, waitpid(pid, &status, 0)); + ASSERT_TRUE(WIFSTOPPED(status)); + + /* read tracee vector csr regs using ptrace GETREGSET */ + + iov.iov_base = regset_data; + iov.iov_len = regset_size; + + ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov)); + + /* verify vector csr regs from tracee context */ + + EXPECT_EQ(regset_data->vstart, variant->vstart); + EXPECT_EQ(regset_data->vtype, variant->vtype); + EXPECT_EQ(regset_data->vcsr, variant->vcsr); + EXPECT_EQ(regset_data->vl, variant->vl); + EXPECT_EQ(regset_data->vlenb, vlenb); + + /* cleanup */ + + ASSERT_EQ(0, kill(pid, SIGKILL)); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c index 7b7d6f21acb4..12f1b1b1c7aa 100644 --- a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c +++ b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c @@ -16,10 +16,10 @@ int main(int argc, char **argv) if (argc > 2 && strcmp(argv[2], "x")) xtheadvector = 1; - ctrl = my_syscall1(__NR_prctl, PR_RISCV_V_GET_CONTROL); - if (ctrl < 0) { + ctrl = prctl(PR_RISCV_V_GET_CONTROL, 0, 0, 0, 0); + if (ctrl == -1) { puts("PR_RISCV_V_GET_CONTROL is not supported\n"); - return ctrl; + exit(-1); } if (test_inherit) { @@ -51,7 +51,7 @@ int main(int argc, char **argv) } if (!pid) { - rc = my_syscall1(__NR_prctl, PR_RISCV_V_GET_CONTROL); + rc = prctl(PR_RISCV_V_GET_CONTROL, 0, 0, 0, 0); if (rc != ctrl) { puts("child's vstate_ctrl not equal to parent's\n"); exit(-1); diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 0fda241fa62b..ec01d164c1f0 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -10,3 +10,4 @@ param_test_mm_cid param_test_mm_cid_benchmark param_test_mm_cid_compare_twice syscall_errors_test +slice_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d0a5fae5954..4ef90823b652 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test slice_test TEST_GEN_PROGS_EXTENDED = librseq.so @@ -59,3 +59,6 @@ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDE $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \ rseq.h rseq-*.h $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h index fb4ec8a75dd4..ecef315204b2 100644 --- a/tools/testing/selftests/rseq/rseq-abi.h +++ b/tools/testing/selftests/rseq/rseq-abi.h @@ -53,6 +53,27 @@ struct rseq_abi_cs { __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); +/** + * rseq_abi_slice_ctrl - Time slice extension control structure + * @all: Compound value + * @request: Request for a time slice extension + * @granted: Granted time slice extension + * + * @request is set by user space and can be cleared by user space or kernel + * space. @granted is set and cleared by the kernel and must only be read + * by user space. + */ +struct rseq_abi_slice_ctrl { + union { + __u32 all; + struct { + __u8 request; + __u8 granted; + __u16 __reserved; + }; + }; +}; + /* * struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. @@ -165,6 +186,12 @@ struct rseq_abi { __u32 mm_cid; /* + * Time slice extension control structure. CPU local updates from + * kernel and user space. + */ + struct rseq_abi_slice_ctrl slice_ctrl; + + /* * Flexible array member at end of structure, after last feature field. */ char end[]; diff --git a/tools/testing/selftests/rseq/rseq-slice-hist.py b/tools/testing/selftests/rseq/rseq-slice-hist.py new file mode 100644 index 000000000000..b7933eeaefb9 --- /dev/null +++ b/tools/testing/selftests/rseq/rseq-slice-hist.py @@ -0,0 +1,132 @@ +#!/usr/bin/python3 + +# +# trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- $cmd +# + +from tracecmd import * + +def load_kallsyms(file_path='/proc/kallsyms'): + """ + Parses /proc/kallsyms into a dictionary. + Returns: { address_int: symbol_name } + """ + kallsyms_map = {} + + try: + with open(file_path, 'r') as f: + for line in f: + # The format is: [address] [type] [name] [module] + parts = line.split() + if len(parts) < 3: + continue + + addr = int(parts[0], 16) + name = parts[2] + + kallsyms_map[addr] = name + + except PermissionError: + print(f"Error: Permission denied reading {file_path}. Try running with sudo.") + except FileNotFoundError: + print(f"Error: {file_path} not found.") + + return kallsyms_map + +ksyms = load_kallsyms() + +# pending[timer_ptr] = {'ts': timestamp, 'comm': comm} +pending = {} + +# histograms[comm][bucket] = count +histograms = {} + +class OnlineHarmonicMean: + def __init__(self): + self.n = 0 # Count of elements + self.S = 0.0 # Cumulative sum of reciprocals + + def update(self, x): + if x == 0: + raise ValueError("Harmonic mean is undefined for zero.") + + self.n += 1 + self.S += 1.0 / x + return self.n / self.S + + @property + def mean(self): + return self.n / self.S if self.n > 0 else 0 + +ohms = {} + +def handle_start(record): + func_name = ksyms[record.num_field("function")] + if "rseq_slice_expired" in func_name: + timer_ptr = record.num_field("hrtimer") + pending[timer_ptr] = { + 'ts': record.ts, + 'comm': record.comm + } + return None + +def handle_cancel(record): + timer_ptr = record.num_field("hrtimer") + + if timer_ptr in pending: + start_data = pending.pop(timer_ptr) + duration_ns = record.ts - start_data['ts'] + duration_us = duration_ns // 1000 + + comm = start_data['comm'] + + if comm not in ohms: + ohms[comm] = OnlineHarmonicMean() + + ohms[comm].update(duration_ns) + + if comm not in histograms: + histograms[comm] = {} + + histograms[comm][duration_us] = histograms[comm].get(duration_us, 0) + 1 + return None + +def handle_expire(record): + timer_ptr = record.num_field("hrtimer") + + if timer_ptr in pending: + start_data = pending.pop(timer_ptr) + comm = start_data['comm'] + + if comm not in histograms: + histograms[comm] = {} + + # Record -1 bucket for expired (failed to cancel) + histograms[comm][-1] = histograms[comm].get(-1, 0) + 1 + return None + +if __name__ == "__main__": + t = Trace("trace.dat") + for cpu in range(0, t.cpus): + ev = t.read_event(cpu) + while ev: + if "hrtimer_start" in ev.name: + handle_start(ev) + if "hrtimer_cancel" in ev.name: + handle_cancel(ev) + if "hrtimer_expire_entry" in ev.name: + handle_expire(ev) + + ev = t.read_event(cpu) + + print("\n" + "="*40) + print("RSEQ SLICE HISTOGRAM (us)") + print("="*40) + for comm, buckets in histograms.items(): + print(f"\nTask: {comm} Mean: {ohms[comm].mean:.3f} ns") + print(f" {'Latency (us)':<15} | {'Count'}") + print(f" {'-'*30}") + # Sort buckets numerically, putting -1 at the top + for bucket in sorted(buckets.keys()): + label = "EXPIRED" if bucket == -1 else f"{bucket} us" + print(f" {label:<15} | {buckets[bucket]}") diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c new file mode 100644 index 000000000000..357122dcb487 --- /dev/null +++ b/tools/testing/selftests/rseq/slice_test.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include <assert.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <syscall.h> +#include <unistd.h> + +#include <linux/prctl.h> +#include <sys/prctl.h> +#include <sys/time.h> + +#include "rseq.h" + +#include "../kselftest_harness.h" + +#ifndef __NR_rseq_slice_yield +# define __NR_rseq_slice_yield 471 +#endif + +#define BITS_PER_INT 32 +#define BITS_PER_BYTE 8 + +#ifndef PR_RSEQ_SLICE_EXTENSION +# define PR_RSEQ_SLICE_EXTENSION 79 +# define PR_RSEQ_SLICE_EXTENSION_GET 1 +# define PR_RSEQ_SLICE_EXTENSION_SET 2 +# define PR_RSEQ_SLICE_EXT_ENABLE 0x01 +#endif + +#ifndef RSEQ_SLICE_EXT_REQUEST_BIT +# define RSEQ_SLICE_EXT_REQUEST_BIT 0 +# define RSEQ_SLICE_EXT_GRANTED_BIT 1 +#endif + +#ifndef asm_inline +# define asm_inline asm __inline +#endif + +#define NSEC_PER_SEC 1000000000L +#define NSEC_PER_USEC 1000L + +struct noise_params { + int64_t noise_nsecs; + int64_t sleep_nsecs; + int64_t run; +}; + +FIXTURE(slice_ext) +{ + pthread_t noise_thread; + struct noise_params noise_params; +}; + +FIXTURE_VARIANT(slice_ext) +{ + int64_t total_nsecs; + int64_t slice_nsecs; + int64_t noise_nsecs; + int64_t sleep_nsecs; + bool no_yield; +}; + +FIXTURE_VARIANT_ADD(slice_ext, n2_2_50) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 2LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, +}; + +FIXTURE_VARIANT_ADD(slice_ext, n50_2_50) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 50LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, +}; + +FIXTURE_VARIANT_ADD(slice_ext, n2_2_50_no_yield) +{ + .total_nsecs = 5LL * NSEC_PER_SEC, + .slice_nsecs = 2LL * NSEC_PER_USEC, + .noise_nsecs = 2LL * NSEC_PER_USEC, + .sleep_nsecs = 50LL * NSEC_PER_USEC, + .no_yield = true, +}; + + +static inline bool elapsed(struct timespec *start, struct timespec *now, + int64_t span) +{ + int64_t delta = now->tv_sec - start->tv_sec; + + delta *= NSEC_PER_SEC; + delta += now->tv_nsec - start->tv_nsec; + return delta >= span; +} + +static void *noise_thread(void *arg) +{ + struct noise_params *p = arg; + + while (RSEQ_READ_ONCE(p->run)) { + struct timespec ts_start, ts_now; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + do { + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_start, &ts_now, p->noise_nsecs)); + + ts_start.tv_sec = 0; + ts_start.tv_nsec = p->sleep_nsecs; + clock_nanosleep(CLOCK_MONOTONIC, 0, &ts_start, NULL); + } + return NULL; +} + +FIXTURE_SETUP(slice_ext) +{ + cpu_set_t affinity; + + ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0); + + /* Pin it on a single CPU. Avoid CPU 0 */ + for (int i = 1; i < CPU_SETSIZE; i++) { + if (!CPU_ISSET(i, &affinity)) + continue; + + CPU_ZERO(&affinity); + CPU_SET(i, &affinity); + ASSERT_EQ(sched_setaffinity(0, sizeof(affinity), &affinity), 0); + break; + } + + ASSERT_EQ(rseq_register_current_thread(), 0); + + ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0); + + self->noise_params.noise_nsecs = variant->noise_nsecs; + self->noise_params.sleep_nsecs = variant->sleep_nsecs; + self->noise_params.run = 1; + + ASSERT_EQ(pthread_create(&self->noise_thread, NULL, noise_thread, &self->noise_params), 0); +} + +FIXTURE_TEARDOWN(slice_ext) +{ + self->noise_params.run = 0; + pthread_join(self->noise_thread, NULL); +} + +TEST_F(slice_ext, slice_test) +{ + unsigned long success = 0, yielded = 0, scheduled = 0, raced = 0; + unsigned long total = 0, aborted = 0; + struct rseq_abi *rs = rseq_get_abi(); + struct timespec ts_start, ts_now; + + ASSERT_NE(rs, NULL); + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + do { + struct timespec ts_cs; + bool req = false; + + clock_gettime(CLOCK_MONOTONIC, &ts_cs); + + total++; + RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1); + do { + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_cs, &ts_now, variant->slice_nsecs)); + + /* + * request can be cleared unconditionally, but for making + * the stats work this is actually checking it first + */ + if (RSEQ_READ_ONCE(rs->slice_ctrl.request)) { + RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0); + /* Race between check and clear! */ + req = true; + success++; + } + + if (RSEQ_READ_ONCE(rs->slice_ctrl.granted)) { + /* The above raced against a late grant */ + if (req) + success--; + if (variant->no_yield) { + syscall(__NR_getpid); + aborted++; + } else { + yielded++; + if (!syscall(__NR_rseq_slice_yield)) + raced++; + } + } else { + if (!req) + scheduled++; + } + + clock_gettime(CLOCK_MONOTONIC, &ts_now); + } while (!elapsed(&ts_start, &ts_now, variant->total_nsecs)); + + printf("# Total %12ld\n", total); + printf("# Success %12ld\n", success); + printf("# Yielded %12ld\n", yielded); + printf("# Aborted %12ld\n", aborted); + printf("# Scheduled %12ld\n", scheduled); + printf("# Raced %12ld\n", raced); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh index d4be97498b32..84d45254675c 100755 --- a/tools/testing/selftests/run_kselftest.sh +++ b/tools/testing/selftests/run_kselftest.sh @@ -30,6 +30,7 @@ Usage: $0 [OPTIONS] -s | --summary Print summary with detailed log in output.log (conflict with -p) -p | --per-test-log Print test log in /tmp with each test name (conflict with -s) -t | --test COLLECTION:TEST Run TEST from COLLECTION + -S | --skip COLLECTION:TEST Skip TEST from COLLECTION -c | --collection COLLECTION Run all tests from COLLECTION -l | --list List the available collection:test entries -d | --dry-run Don't actually run any tests @@ -43,6 +44,7 @@ EOF COLLECTIONS="" TESTS="" +SKIP="" dryrun="" kselftest_override_timeout="" ERROR_ON_FAIL=true @@ -58,6 +60,9 @@ while true; do -t | --test) TESTS="$TESTS $2" shift 2 ;; + -S | --skip) + SKIP="$SKIP $2" + shift 2 ;; -c | --collection) COLLECTIONS="$COLLECTIONS $2" shift 2 ;; @@ -109,6 +114,12 @@ if [ -n "$TESTS" ]; then done available="$(echo "$valid" | sed -e 's/ /\n/g')" fi +# Remove tests to be skipped from available list +if [ -n "$SKIP" ]; then + for skipped in $SKIP ; do + available="$(echo "$available" | grep -v "^${skipped}$")" + done +fi kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)" export kselftest_failures_file diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 5fe45f9c5f8f..2c601a7eaff5 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -183,7 +183,9 @@ auto-test-targets := \ select_cpu_dispatch_bad_dsq \ select_cpu_dispatch_dbl_dsp \ select_cpu_vtime \ + rt_stall \ test_example \ + total_bw \ testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index eddf9e0e26e7..82c71653977b 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -4,6 +4,7 @@ * Copyright (c) 2023 David Vernet <dvernet@meta.com> * Copyright (c) 2023 Tejun Heo <tj@kernel.org> */ +#include <signal.h> #include <stdio.h> #include <unistd.h> #include <sched.h> @@ -23,6 +24,9 @@ static enum scx_test_status run_test(bool global) int ret, i, status; struct sched_param param = {}; pid_t pids[num_pre_forks]; + int pipe_fds[2]; + + SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe"); skel = init_enable_count__open(); SCX_FAIL_IF(!skel, "Failed to open"); @@ -38,26 +42,34 @@ static enum scx_test_status run_test(bool global) * ensure (at least in practical terms) that there are more tasks that * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that * take the fork() path either below or in other processes. + * + * All children will block on read() on the pipe until the parent closes + * the write end after attaching the scheduler, which signals all of + * them to exit simultaneously. Auto-reap so we don't have to wait on + * them. */ + signal(SIGCHLD, SIG_IGN); for (i = 0; i < num_pre_forks; i++) { - pids[i] = fork(); - SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); - if (pids[i] == 0) { - sleep(1); + pid_t pid = fork(); + + SCX_FAIL_IF(pid < 0, "Failed to fork child"); + if (pid == 0) { + char buf; + + close(pipe_fds[1]); + read(pipe_fds[0], &buf, 1); + close(pipe_fds[0]); exit(0); } } + close(pipe_fds[0]); link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); SCX_FAIL_IF(!link, "Failed to attach struct_ops"); - for (i = 0; i < num_pre_forks; i++) { - SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], - "Failed to wait for pre-forked child\n"); - - SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, - status); - } + /* Signal all pre-forked children to exit. */ + close(pipe_fds[1]); + signal(SIGCHLD, SIG_DFL); bpf_link__destroy(link); SCX_GE(skel->bss->init_task_cnt, num_pre_forks); diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c new file mode 100644 index 000000000000..80086779dd1e --- /dev/null +++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that verified if RT tasks can stall SCHED_EXT tasks. + * + * Copyright (c) 2025 NVIDIA Corporation. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops rt_stall_ops = { + .exit = (void *)rt_stall_exit, + .name = "rt_stall", +}; diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c new file mode 100644 index 000000000000..015200f80f6e --- /dev/null +++ b/tools/testing/selftests/sched_ext/rt_stall.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 NVIDIA Corporation. + */ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sched.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <time.h> +#include <linux/sched.h> +#include <signal.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include <unistd.h> +#include "rt_stall.bpf.skel.h" +#include "scx_test.h" +#include "../kselftest.h" + +#define CORE_ID 0 /* CPU to pin tasks to */ +#define RUN_TIME 5 /* How long to run the test in seconds */ + +/* Simple busy-wait function for test tasks */ +static void process_func(void) +{ + while (1) { + /* Busy wait */ + for (volatile unsigned long i = 0; i < 10000000UL; i++) + ; + } +} + +/* Set CPU affinity to a specific core */ +static void set_affinity(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask) != 0) { + perror("sched_setaffinity"); + exit(EXIT_FAILURE); + } +} + +/* Set task scheduling policy and priority */ +static void set_sched(int policy, int priority) +{ + struct sched_param param; + + param.sched_priority = priority; + if (sched_setscheduler(0, policy, ¶m) != 0) { + perror("sched_setscheduler"); + exit(EXIT_FAILURE); + } +} + +/* Get process runtime from /proc/<pid>/stat */ +static float get_process_runtime(int pid) +{ + char path[256]; + FILE *file; + long utime, stime; + int fields; + + snprintf(path, sizeof(path), "/proc/%d/stat", pid); + file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open stat file"); + return -1; + } + + /* Skip the first 13 fields and read the 14th and 15th */ + fields = fscanf(file, + "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu", + &utime, &stime); + fclose(file); + + if (fields != 2) { + fprintf(stderr, "Failed to read stat file\n"); + return -1; + } + + /* Calculate the total time spent in the process */ + long total_time = utime + stime; + long ticks_per_second = sysconf(_SC_CLK_TCK); + float runtime_seconds = total_time * 1.0 / ticks_per_second; + + return runtime_seconds; +} + +static enum scx_test_status setup(void **ctx) +{ + struct rt_stall *skel; + + skel = rt_stall__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static bool sched_stress_test(bool is_ext) +{ + /* + * We're expecting the EXT task to get around 5% of CPU time when + * competing with the RT task (small 1% fluctuations are expected). + * + * However, the EXT task should get at least 4% of the CPU to prove + * that the EXT deadline server is working correctly. A percentage + * less than 4% indicates a bug where RT tasks can potentially + * stall SCHED_EXT tasks, causing the test to fail. + */ + const float expected_min_ratio = 0.04; /* 4% */ + const char *class_str = is_ext ? "EXT" : "FAIR"; + + float ext_runtime, rt_runtime, actual_ratio; + int ext_pid, rt_pid; + + ksft_print_header(); + ksft_set_plan(1); + + /* Create and set up a EXT task */ + ext_pid = fork(); + if (ext_pid == 0) { + set_affinity(CORE_ID); + process_func(); + exit(0); + } else if (ext_pid < 0) { + perror("fork task"); + ksft_exit_fail(); + } + + /* Create an RT task */ + rt_pid = fork(); + if (rt_pid == 0) { + set_affinity(CORE_ID); + set_sched(SCHED_FIFO, 50); + process_func(); + exit(0); + } else if (rt_pid < 0) { + perror("fork for RT task"); + ksft_exit_fail(); + } + + /* Let the processes run for the specified time */ + sleep(RUN_TIME); + + /* Get runtime for the EXT task */ + ext_runtime = get_process_runtime(ext_pid); + if (ext_runtime == -1) + ksft_exit_fail_msg("Error getting runtime for %s task (PID %d)\n", + class_str, ext_pid); + ksft_print_msg("Runtime of %s task (PID %d) is %f seconds\n", + class_str, ext_pid, ext_runtime); + + /* Get runtime for the RT task */ + rt_runtime = get_process_runtime(rt_pid); + if (rt_runtime == -1) + ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid); + ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime); + + /* Kill the processes */ + kill(ext_pid, SIGKILL); + kill(rt_pid, SIGKILL); + waitpid(ext_pid, NULL, 0); + waitpid(rt_pid, NULL, 0); + + /* Verify that the scx task got enough runtime */ + actual_ratio = ext_runtime / (ext_runtime + rt_runtime); + ksft_print_msg("%s task got %.2f%% of total runtime\n", + class_str, actual_ratio * 100); + + if (actual_ratio >= expected_min_ratio) { + ksft_test_result_pass("PASS: %s task got more than %.2f%% of runtime\n", + class_str, expected_min_ratio * 100); + return true; + } + ksft_test_result_fail("FAIL: %s task got less than %.2f%% of runtime\n", + class_str, expected_min_ratio * 100); + return false; +} + +static enum scx_test_status run(void *ctx) +{ + struct rt_stall *skel = ctx; + struct bpf_link *link = NULL; + bool res; + int i; + + /* + * Test if the dl_server is working both with and without the + * sched_ext scheduler attached. + * + * This ensures all the scenarios are covered: + * - fair_server stop -> ext_server start + * - ext_server stop -> fair_server stop + */ + for (i = 0; i < 4; i++) { + bool is_ext = i % 2; + + if (is_ext) { + memset(&skel->data->uei, 0, sizeof(skel->data->uei)); + link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + } + res = sched_stress_test(is_ext); + if (is_ext) { + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + } + + if (!res) + ksft_exit_fail(); + } + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct rt_stall *skel = ctx; + + rt_stall__destroy(skel); +} + +struct scx_test rt_stall = { + .name = "rt_stall", + .description = "Verify that RT tasks cannot stall SCHED_EXT tasks", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&rt_stall) diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c new file mode 100644 index 000000000000..5b0a619bab86 --- /dev/null +++ b/tools/testing/selftests/sched_ext/total_bw.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test to verify that total_bw value remains consistent across all CPUs + * in different BPF program states. + * + * Copyright (C) 2025 NVIDIA Corporation. + */ +#include <bpf/bpf.h> +#include <errno.h> +#include <pthread.h> +#include <scx/common.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/wait.h> +#include <unistd.h> +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +#define MAX_CPUS 512 +#define STRESS_DURATION_SEC 5 + +struct total_bw_ctx { + struct minimal *skel; + long baseline_bw[MAX_CPUS]; + int nr_cpus; +}; + +static void *cpu_stress_thread(void *arg) +{ + volatile int i; + time_t end_time = time(NULL) + STRESS_DURATION_SEC; + + while (time(NULL) < end_time) + for (i = 0; i < 1000000; i++) + ; + + return NULL; +} + +/* + * The first enqueue on a CPU causes the DL server to start, for that + * reason run stressor threads in the hopes it schedules on all CPUs. + */ +static int run_cpu_stress(int nr_cpus) +{ + pthread_t *threads; + int i, ret = 0; + + threads = calloc(nr_cpus, sizeof(pthread_t)); + if (!threads) + return -ENOMEM; + + /* Create threads to run on each CPU */ + for (i = 0; i < nr_cpus; i++) { + if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) { + ret = -errno; + fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret)); + break; + } + } + + /* Wait for all threads to complete */ + for (i = 0; i < nr_cpus; i++) { + if (threads[i]) + pthread_join(threads[i], NULL); + } + + free(threads); + return ret; +} + +static int read_total_bw_values(long *bw_values, int max_cpus) +{ + FILE *fp; + char line[256]; + int cpu_count = 0; + + fp = fopen("/sys/kernel/debug/sched/debug", "r"); + if (!fp) { + SCX_ERR("Failed to open debug file"); + return -1; + } + + while (fgets(line, sizeof(line), fp)) { + char *bw_str = strstr(line, "total_bw"); + + if (bw_str) { + bw_str = strchr(bw_str, ':'); + if (bw_str) { + /* Only store up to max_cpus values */ + if (cpu_count < max_cpus) + bw_values[cpu_count] = atol(bw_str + 1); + cpu_count++; + } + } + } + + fclose(fp); + return cpu_count; +} + +static bool verify_total_bw_consistency(long *bw_values, int count) +{ + int i; + long first_value; + + if (count <= 0) + return false; + + first_value = bw_values[0]; + + for (i = 1; i < count; i++) { + if (bw_values[i] != first_value) { + SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld", + first_value, i, bw_values[i]); + return false; + } + } + + return true; +} + +static int fetch_verify_total_bw(long *bw_values, int nr_cpus) +{ + int attempts = 0; + int max_attempts = 10; + int count; + + /* + * The first enqueue on a CPU causes the DL server to start, for that + * reason run stressor threads in the hopes it schedules on all CPUs. + */ + if (run_cpu_stress(nr_cpus) < 0) { + SCX_ERR("Failed to run CPU stress"); + return -1; + } + + /* Try multiple times to get stable values */ + while (attempts < max_attempts) { + count = read_total_bw_values(bw_values, nr_cpus); + fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus); + /* If system has more CPUs than we're testing, that's OK */ + if (count < nr_cpus) { + SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count); + attempts++; + sleep(1); + continue; + } + + /* Only verify the CPUs we're testing */ + if (verify_total_bw_consistency(bw_values, nr_cpus)) { + fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]); + return 0; + } + + attempts++; + sleep(1); + } + + return -1; +} + +static enum scx_test_status setup(void **ctx) +{ + struct total_bw_ctx *test_ctx; + + if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) { + fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n"); + return SCX_TEST_SKIP; + } + + test_ctx = calloc(1, sizeof(*test_ctx)); + if (!test_ctx) + return SCX_TEST_FAIL; + + test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (test_ctx->nr_cpus <= 0) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */ + if (test_ctx->nr_cpus > MAX_CPUS) + test_ctx->nr_cpus = MAX_CPUS; + + /* Test scenario 1: BPF program not loaded */ + /* Read and verify baseline total_bw before loading BPF program */ + fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable baseline values"); + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* Load the BPF skeleton */ + test_ctx->skel = minimal__open(); + if (!test_ctx->skel) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + SCX_ENUM_INIT(test_ctx->skel); + if (minimal__load(test_ctx->skel)) { + minimal__destroy(test_ctx->skel); + free(test_ctx); + return SCX_TEST_FAIL; + } + + *ctx = test_ctx; + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + struct bpf_link *link; + long loaded_bw[MAX_CPUS]; + long unloaded_bw[MAX_CPUS]; + int i; + + /* Test scenario 2: BPF program loaded */ + link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + fprintf(stderr, "BPF program loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values with BPF loaded"); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + bpf_link__destroy(link); + + /* Test scenario 3: BPF program unloaded */ + fprintf(stderr, "BPF program unloaded, reading total_bw values\n"); + if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values after BPF unload"); + return SCX_TEST_FAIL; + } + + /* Verify all three scenarios have the same total_bw values */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (test_ctx->baseline_bw[i] != loaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld", + i, test_ctx->baseline_bw[i], loaded_bw[i]); + return SCX_TEST_FAIL; + } + + if (test_ctx->baseline_bw[i] != unloaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld", + i, test_ctx->baseline_bw[i], unloaded_bw[i]); + return SCX_TEST_FAIL; + } + } + + fprintf(stderr, "All total_bw values are consistent across all scenarios\n"); + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + + if (test_ctx) { + if (test_ctx->skel) + minimal__destroy(test_ctx->skel); + free(test_ctx); + } +} + +struct scx_test total_bw = { + .name = "total_bw", + .description = "Verify total_bw consistency across BPF program states", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&total_bw) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json new file mode 100644 index 000000000000..0efe229fb86e --- /dev/null +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json @@ -0,0 +1,559 @@ +[ + { + "id": "684b", + "name": "Create CAKE_MQ with default setting (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1\" > /sys/bus/netdevsim/del_device || true", + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7ee8", + "name": "Create CAKE_MQ with bandwidth limit (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq bandwidth 1000", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "1f87", + "name": "Create CAKE_MQ with rtt time (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq rtt 200", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 200us raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "e9cf", + "name": "Create CAKE_MQ with besteffort flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq besteffort", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited besteffort triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7c05", + "name": "Create CAKE_MQ with diffserv8 flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv8", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv8 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "5a77", + "name": "Create CAKE_MQ with diffserv4 flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv4", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv4 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "8f7a", + "name": "Create CAKE_MQ with flowblind flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7ef7", + "name": "Create CAKE_MQ with dsthost and nat flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dsthost nat", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dsthost nat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2e4d", + "name": "Create CAKE_MQ with wash flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq hosts wash", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 hosts nonat wash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "b3e6", + "name": "Create CAKE_MQ with flowblind and no-split-gso flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind no-split-gso", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter no-split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "62cd", + "name": "Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-srchost ack-filter", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-srchost nonat nowash ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "0df3", + "name": "Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-dsthost ack-filter-aggressive", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-dsthost nonat nowash ack-filter-aggressive split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "9a75", + "name": "Create CAKE_MQ with memlimit and ptm flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq memlimit 10000 ptm", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw ptm overhead 0 memlimit 10000b ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "cdef", + "name": "Create CAKE_MQ with fwmark and atm flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq fwmark 8 atm", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw atm overhead 0 fwmark 0x8 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "93dd", + "name": "Create CAKE_MQ with overhead 0 and mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 256 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "1475", + "name": "Create CAKE_MQ with conservative and ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7bf1", + "name": "Delete CAKE_MQ with conservative and ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress" + ], + "cmdUnderTest": "$TC qdisc del dev $ETH handle 1: root", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "ee55", + "name": "Replace CAKE_MQ with mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256" + ], + "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq mpu 128", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "6df9", + "name": "Change CAKE_MQ with mpu (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256" + ], + "cmdUnderTest": "$TC qdisc change dev $ETH handle 1: root cake_mq mpu 128", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "67e2", + "name": "Show CAKE_MQ class (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "0", + "verifyCmd": "$TC class show dev $ETH", + "matchPattern": "class cake_mq", + "matchCount": "4", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "2de4", + "name": "Change bandwidth of CAKE_MQ (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq bandwidth 1000", + "expExitCode": "0", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "5f62", + "name": "Fail to create CAKE_MQ with autorate-ingress flag (4 queues)", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq autorate-ingress", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited autorate-ingress diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "038e", + "name": "Fail to change setting of sub-qdisc under CAKE_MQ", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 cake besteffort flows", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "7bdc", + "name": "Fail to replace sub-qdisc under CAKE_MQ", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 4\" > /sys/bus/netdevsim/new_device", + "$TC qdisc add dev $ETH handle 1: root cake_mq" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 fq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "5", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, + { + "id": "18e0", + "name": "Fail to install CAKE_MQ on single queue device", + "category": [ + "qdisc", + "cake_mq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 1\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + } +] diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c index ca2bd03154e4..569d44f22835 100644 --- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c +++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c @@ -12,6 +12,7 @@ #define WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/notification_delay_ms" #define WORKLOAD_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_hint_enable" +#define WORKLOAD_SLOW_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_slow_hint_enable" #define WORKLOAD_TYPE_INDEX_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_type_index" static const char * const workload_types[] = { @@ -22,6 +23,9 @@ static const char * const workload_types[] = { NULL }; +static int wlt_slow; +static char *wlt_enable_attr; + #define WORKLOAD_TYPE_MAX_INDEX 3 void workload_hint_exit(int signum) @@ -30,7 +34,7 @@ void workload_hint_exit(int signum) /* Disable feature via sysfs knob */ - fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); + fd = open(wlt_enable_attr, O_RDWR); if (fd < 0) { perror("Unable to open workload type feature enable file"); exit(1); @@ -46,6 +50,26 @@ void workload_hint_exit(int signum) close(fd); } +static void update_delay(char *delay_str) +{ + int fd; + + printf("Setting notification delay in ms to %s\n", delay_str); + + fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); + if (fd < 0) { + perror("Unable to open workload notification delay"); + exit(1); + } + + if (write(fd, delay_str, strlen(delay_str)) < 0) { + perror("Can't set delay"); + exit(1); + } + + close(fd); +} + int main(int argc, char **argv) { struct pollfd ufd; @@ -54,32 +78,26 @@ int main(int argc, char **argv) char delay_str[64]; int delay = 0; - printf("Usage: workload_hint_test [notification delay in milli seconds]\n"); + printf("Usage: workload_hint_test [notification delay in milli seconds][slow]\n"); if (argc > 1) { - ret = sscanf(argv[1], "%d", &delay); - if (ret < 0) { - printf("Invalid delay\n"); - exit(1); - } + int i; - printf("Setting notification delay to %d ms\n", delay); - if (delay < 0) - exit(1); + for (i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "slow")) { + wlt_slow = 1; + continue; + } - sprintf(delay_str, "%s\n", argv[1]); - fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR); - if (fd < 0) { - perror("Unable to open workload notification delay"); - exit(1); - } + ret = sscanf(argv[1], "%d", &delay); + if (ret < 0) { + printf("Invalid delay\n"); + exit(1); + } - if (write(fd, delay_str, strlen(delay_str)) < 0) { - perror("Can't set delay"); - exit(1); + sprintf(delay_str, "%s\n", argv[1]); + update_delay(delay_str); } - - close(fd); } if (signal(SIGINT, workload_hint_exit) == SIG_IGN) @@ -89,8 +107,13 @@ int main(int argc, char **argv) if (signal(SIGTERM, workload_hint_exit) == SIG_IGN) signal(SIGTERM, SIG_IGN); + if (wlt_slow) + wlt_enable_attr = WORKLOAD_SLOW_ENABLE_ATTRIBUTE; + else + wlt_enable_attr = WORKLOAD_ENABLE_ATTRIBUTE; + /* Enable feature via sysfs knob */ - fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR); + fd = open(wlt_enable_attr, O_RDWR); if (fd < 0) { perror("Unable to open workload type feature enable file"); exit(1); @@ -145,6 +168,13 @@ int main(int argc, char **argv) if (ret < 0) break; + if (wlt_slow) { + if (index & 0x10) + printf("workload type slow:%s\n", "power"); + else + printf("workload type slow:%s\n", "performance"); + } + index &= 0x0f; if (index > WORKLOAD_TYPE_MAX_INDEX) printf("Invalid workload type index\n"); diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore index 8b2871ea7751..e17bd28f27e0 100644 --- a/tools/testing/selftests/ublk/.gitignore +++ b/tools/testing/selftests/ublk/.gitignore @@ -1,3 +1,5 @@ -kublk -/tools +# SPDX-License-Identifier: GPL-2.0 *-verify.state +/tools +kublk +metadata_size diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 06ba6fde098d..8ac2d4a682a1 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -7,22 +7,21 @@ endif LDLIBS += -lpthread -lm -luring -TEST_PROGS := test_generic_01.sh -TEST_PROGS += test_generic_02.sh +TEST_PROGS := test_generic_02.sh TEST_PROGS += test_generic_03.sh -TEST_PROGS += test_generic_04.sh -TEST_PROGS += test_generic_05.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh TEST_PROGS += test_generic_08.sh TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh -TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh -TEST_PROGS += test_generic_14.sh -TEST_PROGS += test_generic_15.sh +TEST_PROGS += test_generic_16.sh + +TEST_PROGS += test_batch_01.sh +TEST_PROGS += test_batch_02.sh +TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh @@ -34,6 +33,14 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh + +TEST_PROGS += test_integrity_01.sh +TEST_PROGS += test_integrity_02.sh + +TEST_PROGS += test_recover_01.sh +TEST_PROGS += test_recover_02.sh +TEST_PROGS += test_recover_03.sh +TEST_PROGS += test_recover_04.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh @@ -41,6 +48,9 @@ TEST_PROGS += test_stripe_04.sh TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh +TEST_PROGS += test_part_01.sh +TEST_PROGS += test_part_02.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh @@ -48,13 +58,55 @@ TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_PROGS += test_stress_08.sh +TEST_PROGS += test_stress_09.sh + +TEST_FILES := settings -TEST_GEN_PROGS_EXTENDED = kublk +TEST_GEN_PROGS_EXTENDED = kublk metadata_size +STANDALONE_UTILS := metadata_size.c LOCAL_HDRS += $(wildcard *.h) include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c) +$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh + +# Test groups for running subsets of tests +# JOBS=1 (default): sequential with kselftest TAP output +# JOBS>1: parallel execution with xargs -P +# Usage: make run_null JOBS=4 +JOBS ?= 1 +export JOBS + +# Auto-detect test groups from TEST_PROGS (test_<group>_<num>.sh -> group) +TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ + sed 's/test_\([^_]*\)_.*/\1/' | sort -u) + +# Template for group test targets +# $(1) = group name (e.g., null, generic, stress) +define RUN_GROUP +run_$(1): all + @if [ $$(JOBS) -gt 1 ]; then \ + echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \ + xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \ + else \ + $$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \ + fi +.PHONY: run_$(1) +endef + +# Generate targets for each discovered test group +$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group)))) + +# Run all tests (parallel when JOBS>1) +run_all: all + @if [ $(JOBS) -gt 1 ]; then \ + echo $(TEST_PROGS) | tr ' ' '\n' | \ + xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \ + else \ + $(call RUN_TESTS, $(TEST_PROGS)); \ + fi +.PHONY: run_all diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 000000000000..a54025b00917 --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,607 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. + */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8bytes for carrying buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + if (t->commit_buf) { + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + + munlock(t->commit_buf, total); + free(t->commit_buf); + } + allocator_deinit(&t->commit_buf_alloc); + free(t->commit); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) + t->commit[j++].q_id = i; + } + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + + /* lock commit buffer pages for fast access */ + if (mlock(t->commit_buf, total)) + ublk_err("%s: can't lock commit buffer %s\n", __func__, + strerror(errno)); + + return 0; + +fail: + free_batch_commit_buf(t); + return ret; +} + +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->q_map[i]; + + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle single device in this thread context. + * + * All queues have same feature flags, so use queue 0's for + * calculate uring_cmd flags. + * + * This way looks not elegant, but it works so far. + */ + struct ublk_queue *q = &t->dev->q[0]; + + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2 * t->nr_queues; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_fetch_bufs; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } + free(t->fetch); +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + int ret; + + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); + free_batch_fetch_buf(t); +} + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + /* Use plain user buffer instead of fixed buffer */ + cmd->flags |= t->cmd_flags; +} + +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t) +{ + int i; + int j = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) { + struct ublk_queue *q = &t->dev->q[i]; + + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + +static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + t->cmd_inflight--; + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->cmd_inflight--; + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + t->cmd_inflight--; + ublk_batch_queue_fetch(t, q, buf_idx); + } +} + +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = cb->done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, cb->buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + cb->count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; + struct ublk_io *io = &q->ios[tag]; + + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + + ublk_assert(q->q_id == cb->q_id); + + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} + +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues) +{ + int i, j; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. + * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < nthreads; j++) { + for (i = 0; i < queues; i++) { + printf("%03u ", q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index 01580a6f8519..530f9877c9dd 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -12,11 +12,11 @@ void backing_file_tgt_deinit(struct ublk_dev *dev) } } -int backing_file_tgt_init(struct ublk_dev *dev) +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; @@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev) ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); - fd = open(file, O_RDWR | O_DIRECT); + fd = open(file, O_RDWR | (i < nr_direct ? O_DIRECT : 0)); if (fd < 0) { ublk_err("%s: backing file %s can't be opened: %s\n", __func__, file, strerror(errno)); diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index b227bd78b252..3b897f69c014 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, .dev_sectors = dev_size >> 9, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); return 0; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 269d5f124e06..228af2580ac6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, @@ -35,8 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, unsigned auto_zc = ublk_queue_use_auto_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc); struct ublk_io *io = ublk_get_io(q, tag); + __u64 offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + unsigned short buf_index = ublk_io_buf_idx(t, q, tag); + + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { + ublk_io_alloc_sqes(t, sqe, 1); + /* Use second backing file for integrity data */ + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2), + io->integrity_buf, + ublk_integrity_len(q, len), + ublk_integrity_len(q, offset)); + sqe[0]->flags = IOSQE_FIXED_FILE; + /* tgt_data = 1 indicates integrity I/O */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1); + } if (!zc || auto_zc) { ublk_io_alloc_sqes(t, sqe, 1); @@ -45,34 +60,34 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, addr, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_index; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - return 1; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1; } ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, - iod->nr_sectors << 9, - iod->start_sector << 9); - sqe[1]->buf_index = tag; + len, + offset); + sqe[1]->buf_index = buf_index; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); - return 2; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; } static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag) @@ -119,12 +134,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); - if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { - if (!io->result) - io->result = cqe->res; - if (cqe->res < 0) - ublk_err("%s: io failed op %x user_data %lx\n", - __func__, op, cqe->user_data); + if (cqe->res < 0) { + io->result = cqe->res; + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + __s32 data_len = user_data_to_tgt_data(cqe->user_data) + ? ublk_integrity_data_len(q, cqe->res) + : cqe->res; + + if (!io->result || data_len < io->result) + io->result = data_len; } /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ @@ -135,9 +155,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, ublk_complete_io(t, q, tag, io->result); } +static int ublk_loop_memset_file(int fd, __u8 byte, size_t len) +{ + off_t offset = 0; + __u8 buf[4096]; + + memset(buf, byte, sizeof(buf)); + while (len) { + int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset); + + if (ret < 0) + return -errno; + if (!ret) + return -EIO; + + len -= ret; + offset += ret; + } + return 0; +} + static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { unsigned long long bytes; + unsigned long blocks; int ret; struct ublk_params p = { .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, @@ -154,19 +195,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) }, }; + ublk_set_integrity_params(ctx, &p); if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } - ret = backing_file_tgt_init(dev); + /* Use O_DIRECT only for data file */ + ret = backing_file_tgt_init(dev, 1); if (ret) return ret; - if (dev->tgt.nr_backing_files != 1) + /* Expect a second file for integrity data */ + if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size) return -EINVAL; - bytes = dev->tgt.backing_file_size[0]; + blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift; + if (ctx->metadata_size) { + unsigned long metadata_blocks = + dev->tgt.backing_file_size[1] / ctx->metadata_size; + unsigned long integrity_len; + + /* Ensure both data and integrity data fit in backing files */ + blocks = min(blocks, metadata_blocks); + integrity_len = blocks * ctx->metadata_size; + /* + * Initialize PI app tag and ref tag to 0xFF + * to disable bio-integrity-auto checks + */ + ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len); + if (ret) + return ret; + } + bytes = blocks << p.basic.logical_bs_shift; dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f197ad9cc262..e1c3b3c55e56 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -3,6 +3,7 @@ * Description: uring_cmd based ublk */ +#include <linux/fs.h> #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -107,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev) return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_TRY_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + static int ublk_ctrl_start_dev(struct ublk_dev *dev, int daemon_pid) { @@ -415,14 +425,18 @@ static void ublk_queue_deinit(struct ublk_queue *q) if (q->io_cmd_buf) munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); - for (i = 0; i < nr_ios; i++) + for (i = 0; i < nr_ios; i++) { free(q->ios[i].buf_addr); + free(q->ios[i].integrity_buf); + } } static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -432,19 +446,22 @@ static void ublk_thread_deinit(struct ublk_thread *t) } } -static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) +static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, + __u8 metadata_size) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; int i; - int cmd_buf_size, io_buf_size; + int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; q->flags = dev->dev_info.flags; q->flags |= extra_flags; + q->metadata_size = metadata_size; /* Cache fd in queue for fast path access */ q->ublk_fd = dev->fds[0]; @@ -460,11 +477,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) } io_buf_size = dev->dev_info.max_io_buf_bytes; + integrity_size = ublk_integrity_len(q, io_buf_size); for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; q->ios[i].tag = i; + if (integrity_size) { + q->ios[i].integrity_buf = malloc(integrity_size); + if (!q->ios[i].integrity_buf) { + ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n", + dev->dev_info.dev_id, q->q_id, i, + integrity_size); + goto fail; + } + } + + if (ublk_queue_no_buf(q)) continue; @@ -491,6 +520,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth * 2; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -505,15 +538,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret = io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { @@ -579,16 +630,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -607,13 +659,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) __u8 ublk_op = ublksrv_get_op(iod); __u32 len = iod->nr_sectors << 9; void *addr = io->buf_addr; + ssize_t copied; if (ublk_op != match_ublk_op) return; while (len) { __u32 copy_len = min(len, UBLK_USER_COPY_LEN); - ssize_t copied; if (ublk_op == UBLK_IO_OP_WRITE) copied = pread(q->ublk_fd, addr, copy_len, off); @@ -626,6 +678,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) off += copy_len; len -= copy_len; } + + if (!(iod->op_flags & UBLK_IO_F_INTEGRITY)) + return; + + len = ublk_integrity_len(q, iod->nr_sectors << 9); + off = ublk_user_copy_offset(q->q_id, io->tag); + off |= UBLKSRV_IO_INTEGRITY_FLAG; + if (ublk_op == UBLK_IO_OP_WRITE) + copied = pread(q->ublk_fd, io->integrity_buf, len, off); + else if (ublk_op == UBLK_IO_OP_READ) + copied = pwrite(q->ublk_fd, io->integrity_buf, len, off); + else + assert(0); + assert(copied == (ssize_t)len); } int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) @@ -690,7 +756,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); @@ -779,13 +845,15 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, unsigned tag = user_data_to_tag(cqe->user_data); struct ublk_io *io = &q->ios[tag]; + t->cmd_inflight--; + if (!fetch) { t->state |= UBLKS_T_STOPPING; io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (ublk_queue_use_user_copy(q)) ublk_user_copy(io, UBLK_IO_OP_WRITE); @@ -813,28 +881,30 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); return; } - t->cmd_inflight--; - - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -866,7 +936,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), @@ -882,6 +958,7 @@ struct ublk_thread_info { sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; }; static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) @@ -891,6 +968,26 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) info->dev->dev_info.dev_id, info->idx); } +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->q_map[i] == 0) + continue; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret >= 0); + } +} + static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) { struct ublk_thread t = { @@ -900,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf int dev_id = info->dev->dev_info.dev_id; int ret; + /* Copy per-thread queue mapping into thread-local variable */ + if (info->q_thread_map) + memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", @@ -911,8 +1012,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t.idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(&t); + if (!ublk_thread_batch_io(&t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + } else { + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t); + } + do { if (ublk_process_io(&t) < 0) break; @@ -984,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; void *thread_ret; sem_t ready; int ret, i; @@ -1003,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ret) return ret; + if (ublk_dev_batch_io(dev)) { + q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); + if (!q_thread_map) { + ret = -ENOMEM; + goto fail; + } + ublk_batch_setup_map(q_thread_map, dev->nthreads, + dinfo->nr_hw_queues); + } + if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; if (ctx->no_ublk_fixed_fd) @@ -1012,7 +1130,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->q[i].dev = dev; dev->q[i].q_id = i; - ret = ublk_queue_init(&dev->q[i], extra_flags); + ret = ublk_queue_init(&dev->q[i], extra_flags, + ctx->metadata_size); if (ret) { ublk_err("ublk dev %d queue %d init queue failed\n", dinfo->dev_id, i); @@ -1025,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].idx = i; tinfo[i].ready = &ready; tinfo[i].extra_flags = extra_flags; + tinfo[i].q_thread_map = q_thread_map; /* * If threads are not tied 1:1 to queues, setting thread @@ -1044,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); free(affinity_buf); + free(q_thread_map); /* everything is fine now, start us */ if (ctx->recovery) @@ -1214,7 +1335,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1394,6 +1516,42 @@ static int cmd_dev_del(struct dev_ctx *ctx) return 0; } +static int cmd_dev_stop(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + if (number < 0) { + ublk_err("%s: device id is required\n", __func__); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + if (ctx->safe_stop) { + ret = ublk_ctrl_try_stop_dev(dev); + if (ret < 0) + ublk_err("%s: try_stop dev %d failed ret %d\n", + __func__, number, ret); + } else { + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", + __func__, number, ret); + } + +fail: + ublk_ctrl_deinit(dev); + + return ret; +} + static int __cmd_dev_list(struct dev_ctx *ctx) { struct ublk_dev *dev = ublk_ctrl_init(); @@ -1456,6 +1614,10 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), + FEAT_NAME(UBLK_F_INTEGRITY), + FEAT_NAME(UBLK_F_SAFE_STOP_DEV), + FEAT_NAME(UBLK_F_BATCH_IO), + FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), }; struct ublk_dev *dev; __u64 features = 0; @@ -1551,6 +1713,9 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n"); printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " + "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); + printf("\t[--batch|-b] [--no_auto_part_scan]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1583,6 +1748,8 @@ static int cmd_dev_help(char *exe) printf("%s del [-n dev_id] -a \n", exe); printf("\t -a delete all devices -n delete specified device\n\n"); + printf("%s stop -n dev_id [--safe]\n", exe); + printf("\t --safe only stop if device has no active openers\n\n"); printf("%s list [-n dev_id] -a \n", exe); printf("\t -a list all devices, -n list specified device, default -a \n\n"); printf("%s features\n", exe); @@ -1614,6 +1781,15 @@ int main(int argc, char *argv[]) { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, { "no_ublk_fixed_fd", 0, NULL, 0 }, + { "integrity_capable", 0, NULL, 0 }, + { "integrity_reftag", 0, NULL, 0 }, + { "metadata_size", 1, NULL, 0 }, + { "pi_offset", 1, NULL, 0 }, + { "csum_type", 1, NULL, 0 }, + { "tag_size", 1, NULL, 0 }, + { "safe", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, + { "no_auto_part_scan", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1625,6 +1801,7 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", + .csum_type = LBMD_PI_CSUM_NONE, }; int ret = -EINVAL, i; int tgt_argc = 1; @@ -1636,12 +1813,15 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", longopts, &option_idx)) != -1) { switch (opt) { case 'a': ctx.all = 1; break; + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'n': ctx.dev_id = strtol(optarg, NULL, 10); break; @@ -1699,6 +1879,32 @@ int main(int argc, char *argv[]) ctx.per_io_tasks = 1; if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) ctx.no_ublk_fixed_fd = 1; + if (!strcmp(longopts[option_idx].name, "integrity_capable")) + ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY; + if (!strcmp(longopts[option_idx].name, "integrity_reftag")) + ctx.integrity_flags |= LBMD_PI_CAP_REFTAG; + if (!strcmp(longopts[option_idx].name, "metadata_size")) + ctx.metadata_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "pi_offset")) + ctx.pi_offset = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "csum_type")) { + if (!strcmp(optarg, "ip")) { + ctx.csum_type = LBMD_PI_CSUM_IP; + } else if (!strcmp(optarg, "t10dif")) { + ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF; + } else if (!strcmp(optarg, "nvme")) { + ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME; + } else { + ublk_err("invalid csum_type: %s\n", optarg); + return -EINVAL; + } + } + if (!strcmp(longopts[option_idx].name, "tag_size")) + ctx.tag_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "safe")) + ctx.safe_stop = 1; + if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) + ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; break; case '?': /* @@ -1722,6 +1928,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && @@ -1741,6 +1952,28 @@ int main(int argc, char *argv[]) return -EINVAL; } + if (ctx.metadata_size) { + if (!(ctx.flags & UBLK_F_USER_COPY)) { + ublk_err("integrity requires user_copy\n"); + return -EINVAL; + } + + ctx.flags |= UBLK_F_INTEGRITY; + } else if (ctx.integrity_flags || + ctx.pi_offset || + ctx.csum_type != LBMD_PI_CSUM_NONE || + ctx.tag_size) { + ublk_err("integrity parameters require metadata_size\n"); + return -EINVAL; + } + + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; @@ -1766,6 +1999,8 @@ int main(int argc, char *argv[]) } } else if (!strcmp(cmd, "del")) ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "stop")) + ret = cmd_dev_stop(&ctx); else if (!strcmp(cmd, "list")) { ctx.all = 1; ret = cmd_dev_list(&ctx); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8a83b90ec603..02f0c55d006b 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,6 +78,13 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + unsigned int safe_stop:1; + unsigned int no_auto_part_scan:1; + __u32 integrity_flags; + __u8 metadata_size; + __u8 pi_offset; + __u8 csum_type; + __u8 tag_size; int _evtfd; int _shmid; @@ -107,6 +114,7 @@ struct ublk_ctrl_cmd_data { struct ublk_io { char *buf_addr; + void *integrity_buf; #define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) #define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) @@ -143,7 +151,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -165,23 +174,76 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ + __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; +}; + +/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; }; struct ublk_thread { + /* Thread-local copy of queue-to-thread mapping for this thread */ + unsigned char q_map[UBLK_MAX_QUEUES]; + struct ublk_dev *dev; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; unsigned int cmd_inflight; unsigned int io_inflight; + + unsigned short nr_bufs; + + /* followings are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We just support single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; + struct batch_commit_buf *commit; + /* FETCH_IO_CMDS buffer */ + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; + struct io_uring ring; }; @@ -202,6 +264,55 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only work for handle single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + +static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, + struct ublk_params *params) +{ + if (!ctx->metadata_size) + return; + + params->types |= UBLK_PARAM_TYPE_INTEGRITY; + params->integrity = (struct ublk_param_integrity) { + .flags = ctx->integrity_flags, + .interval_exp = params->basic.logical_bs_shift, + .metadata_size = ctx->metadata_size, + .pi_offset = ctx->pi_offset, + .csum_type = ctx->csum_type, + .tag_size = ctx->tag_size, + }; +} + +static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len) +{ + /* All targets currently use interval_exp = logical_bs_shift = 9 */ + return (len >> 9) * q->metadata_size; +} + +static inline size_t +ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len) +{ + return (integrity_len / q->metadata_size) << 9; +} static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { @@ -223,10 +334,10 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, unsigned tgt_data, unsigned q_id, unsigned is_target_io) { /* we only have 7 bits to encode q_id */ - _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7"); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } @@ -357,33 +468,22 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } -static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) -{ - return &q->ios[tag]; -} +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) { - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); + return q->ios[tag].buf_index; } -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) +static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) { - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } + return &q->ios[tag]; } static inline int ublk_completed_tgt_io(struct ublk_thread *t, @@ -421,12 +521,90 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = t->q_map[q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag) +{ + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and cleanup batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; extern const struct ublk_tgt_ops fault_inject_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); -int backing_file_tgt_init(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct); #endif diff --git a/tools/testing/selftests/ublk/metadata_size.c b/tools/testing/selftests/ublk/metadata_size.c new file mode 100644 index 000000000000..76ecddf04d25 --- /dev/null +++ b/tools/testing/selftests/ublk/metadata_size.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <fcntl.h> +#include <linux/fs.h> +#include <stdio.h> +#include <sys/ioctl.h> + +int main(int argc, char **argv) +{ + struct logical_block_metadata_cap cap = {}; + const char *filename; + int fd; + int result; + + if (argc != 2) { + fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]); + return 1; + } + + filename = argv[1]; + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + return 1; + } + + result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap); + if (result < 0) { + perror("ioctl"); + return 1; + } + + printf("metadata_size: %u\n", cap.lbmd_size); + printf("pi_offset: %u\n", cap.lbmd_pi_offset); + printf("pi_tuple_size: %u\n", cap.lbmd_pi_size); + return 0; +} diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 280043f6b689..7656888f4149 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_segments = 32, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; @@ -43,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -60,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -85,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -136,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings new file mode 100644 index 000000000000..682a40f1c8e6 --- /dev/null +++ b/tools/testing/selftests/ublk/settings @@ -0,0 +1 @@ +timeout=150 diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index fd412e1f01c0..dca819f5366e 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 +120,7 @@ static inline enum io_uring_op stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 2 : 0; void *base = io->buf_addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } @@ -298,6 +299,10 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); @@ -311,14 +316,14 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) chunk_shift = ilog2(chunk_size); - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files); if (ret) return ret; if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh new file mode 100755 index 000000000000..a18fb39af8be --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh new file mode 100755 index 000000000000..7ca384d11987 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh new file mode 100755 index 000000000000..aca9cf144b55 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ea9a5f3eb70a..163a40007910 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Derive TID from script name: test_<type>_<num>.sh -> <type>_<num> +# Can be overridden in test script after sourcing this file +TID=$(basename "$0" .sh) +TID=${TID#test_} + UBLK_SKIP_CODE=4 _have_program() { @@ -10,6 +15,16 @@ _have_program() { return 1 } +# Sleep with awareness of parallel execution. +# Usage: _ublk_sleep <normal_secs> <parallel_secs> +_ublk_sleep() { + if [ "${JOBS:-1}" -gt 1 ]; then + sleep "$2" + else + sleep "$1" + fi +} + _get_disk_dev_t() { local dev_id=$1 local dev @@ -43,7 +58,7 @@ _create_backfile() { old_file="${UBLK_BACKFILES[$index]}" [ -f "$old_file" ] && rm -f "$old_file" - new_file=$(mktemp ublk_file_"${new_size}"_XXXXX) + new_file=$(mktemp ${UBLK_TEST_DIR}/ublk_file_"${new_size}"_XXXXX) truncate -s "${new_size}" "${new_file}" UBLK_BACKFILES["$index"]="$new_file" } @@ -60,7 +75,7 @@ _remove_files() { _create_tmp_dir() { local my_file; - my_file=$(mktemp -d ublk_dir_XXXXX) + my_file=$(mktemp -d ${UBLK_TEST_DIR}/ublk_dir_XXXXX) echo "$my_file" } @@ -101,11 +116,6 @@ _check_root() { fi } -_remove_ublk_devices() { - ${UBLK_PROG} del -a - modprobe -r ublk_drv > /dev/null 2>&1 -} - _get_ublk_dev_state() { ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' } @@ -119,8 +129,12 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - UBLK_TMP=$(mktemp ublk_test_XXXXX) + local base_dir=${TMPDIR:-./ublktest-dir} + mkdir -p "$base_dir" + UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX) + UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" + echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } _remove_test_files() @@ -162,9 +176,16 @@ _check_add_dev() } _cleanup_test() { - "${UBLK_PROG}" del -a + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + while read -r dev_id; do + ${UBLK_PROG} del -n "${dev_id}" + done < "${UBLK_TEST_DIR}/.ublk_devs" + rm -f "${UBLK_TEST_DIR}/.ublk_devs" + fi _remove_files + rmdir ${UBLK_TEST_DIR} + echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } _have_feature() @@ -197,10 +218,11 @@ _create_ublk_dev() { fi if [ "$settle" = "yes" ]; then - udevadm settle + udevadm settle --timeout=20 fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then + echo "$dev_id" >> "${UBLK_TEST_DIR}/.ublk_devs" echo "${dev_id}" else return 255 @@ -220,7 +242,7 @@ _recover_ublk_dev() { local state dev_id=$(_create_ublk_dev "recover" "yes" "$@") - for ((j=0;j<20;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "LIVE" ] && break sleep 1 @@ -240,7 +262,7 @@ __ublk_quiesce_dev() return "$state" fi - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "$exp_state" ] && break sleep 1 @@ -259,7 +281,7 @@ __ublk_kill_daemon() daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") state=$(_get_ublk_dev_state "${dev_id}") - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do [ "$state" == "$exp_state" ] && break kill -9 "$daemon_pid" > /dev/null 2>&1 sleep 1 @@ -268,12 +290,23 @@ __ublk_kill_daemon() echo "$state" } -__remove_ublk_dev_return() { +_ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" + + # Remove from tracking file + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + sed -i "/^${dev_id}$/d" "${UBLK_TEST_DIR}/.ublk_devs" + fi +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + _ublk_del_dev "${dev_id}" local res=$? - udevadm settle + udevadm settle --timeout=20 return ${res} } @@ -384,6 +417,16 @@ _ublk_test_top_dir() cd "$(dirname "$0")" && pwd } +METADATA_SIZE_PROG="$(_ublk_test_top_dir)/metadata_size" + +_get_metadata_size() +{ + local dev_id=$1 + local field=$2 + + "$METADATA_SIZE_PROG" "/dev/ublkb$dev_id" | grep "$field" | grep -o "[0-9]*" +} + UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh deleted file mode 100755 index 21a31cd5491a..000000000000 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -TID="generic_01" -ERR_CODE=0 - -if ! _have_program bpftrace; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "null" "sequential io order" - -dev_id=$(_add_ublk_dev -t null) -_check_add_dev $TID $? - -dev_t=$(_get_disk_dev_t "$dev_id") -bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & -btrace_pid=$! -sleep 2 - -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then - _cleanup_test "null" - exit "$UBLK_SKIP_CODE" -fi - -# run fio over this ublk disk -fio --name=write_seq \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --bs=4k > /dev/null 2>&1 -ERR_CODE=$? -kill "$btrace_pid" -wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" - ERR_CODE=255 -fi -_cleanup_test "null" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 12920768b1a0..46b657143fd6 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_02" ERR_CODE=0 if ! _have_program bpftrace; then @@ -14,7 +13,7 @@ if ! _have_program fio; then exit "$UBLK_SKIP_CODE" fi -_prep_test "null" "sequential io order for MQ" +_prep_test "null" "ublk dispatch won't reorder IO for MQ" dev_id=$(_add_ublk_dev -t null -q 2) _check_add_dev $TID $? @@ -22,15 +21,20 @@ _check_add_dev $TID $? dev_t=$(_get_disk_dev_t "$dev_id") bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & btrace_pid=$! -sleep 2 -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then +# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY) +for _ in $(seq 100); do + grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break + sleep 0.1 +done + +if ! kill -0 "$btrace_pid" 2>/dev/null; then _cleanup_test "null" exit "$UBLK_SKIP_CODE" fi -# run fio over this ublk disk -fio --name=write_seq \ +# run fio over this ublk disk (pinned to CPU 0) +taskset -c 0 fio --name=write_seq \ --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=16 \ --rw=write \ @@ -40,8 +44,11 @@ fio --name=write_seq \ ERR_CODE=$? kill "$btrace_pid" wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" + +# Check for out-of-order completions detected by bpftrace +if grep -q "^out_of_order:" "$UBLK_TMP"; then + echo "I/O reordering detected:" + grep "^out_of_order:" "$UBLK_TMP" ERR_CODE=255 fi _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh index b551aa76cb0d..8934ea926762 100755 --- a/tools/testing/selftests/ublk/test_generic_03.sh +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_03" ERR_CODE=0 _prep_test "null" "check dma & segment limits for zero copy" diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh index fd42062b7b76..14a05054fcd8 100755 --- a/tools/testing/selftests/ublk/test_generic_06.sh +++ b/tools/testing/selftests/ublk/test_generic_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_06" ERR_CODE=0 _prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server" diff --git a/tools/testing/selftests/ublk/test_generic_07.sh b/tools/testing/selftests/ublk/test_generic_07.sh index cba86451fa5e..8dcfd8978f50 100755 --- a/tools/testing/selftests/ublk/test_generic_07.sh +++ b/tools/testing/selftests/ublk/test_generic_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_07" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh index b222f3a77e12..ce88c31d6b9c 100755 --- a/tools/testing/selftests/ublk/test_generic_08.sh +++ b/tools/testing/selftests/ublk/test_generic_08.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_08" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh index bb6f77ca5522..744d0cdaa242 100755 --- a/tools/testing/selftests/ublk/test_generic_09.sh +++ b/tools/testing/selftests/ublk/test_generic_09.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_09" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh index abc11c3d416b..4b4293b9081f 100755 --- a/tools/testing/selftests/ublk/test_generic_10.sh +++ b/tools/testing/selftests/ublk/test_generic_10.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_10" ERR_CODE=0 if ! _have_feature "UPDATE_SIZE"; then diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index b4046201b4d9..54b81ddfe9f9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_12" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh index b7aa90b1cb74..922115aa14f4 100755 --- a/tools/testing/selftests/ublk/test_generic_13.sh +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_13" ERR_CODE=0 _prep_test "null" "check that feature list is complete" diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh new file mode 100755 index 000000000000..3ef367836ac5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "null" "stop --safe command" + +# Check if SAFE_STOP_DEV feature is supported +if ! _have_feature "SAFE_STOP_DEV"; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# Test 1: stop --safe on idle device should succeed +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Device is idle (no openers), stop --safe should succeed +if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then + echo "stop --safe on idle device failed unexpectedly!" + ERR_CODE=255 +fi + +# Clean up device +_ublk_del_dev "${dev_id}" > /dev/null 2>&1 +udevadm settle + +# Test 2: stop --safe on device with active opener should fail +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Open device in background (dd reads indefinitely) +dd if=/dev/ublkb${dev_id} of=/dev/null bs=4k iflag=direct > /dev/null 2>&1 & +dd_pid=$! + +# Give dd time to start +sleep 0.2 + +# Device has active opener, stop --safe should fail with -EBUSY +if ${UBLK_PROG} stop -n "${dev_id}" --safe 2>/dev/null; then + echo "stop --safe on busy device succeeded unexpectedly!" + ERR_CODE=255 +fi + +# Kill dd and clean up +kill $dd_pid 2>/dev/null +wait $dd_pid 2>/dev/null + +# Now device should be idle, regular delete should work +_ublk_del_dev "${dev_id}" +udevadm settle + +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_01.sh b/tools/testing/selftests/ublk/test_integrity_01.sh new file mode 100755 index 000000000000..6713b280a6ff --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_01.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 + + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} + +_test_metadata_only() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + _ublk_del_dev "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + _ublk_del_dev "${dev_id}" +} + +_prep_test "null" "integrity params" + +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_integrity_02.sh b/tools/testing/selftests/ublk/test_integrity_02.sh new file mode 100755 index 000000000000..aaf1f52da559 --- /dev/null +++ b/tools/testing/selftests/ublk/test_integrity_02.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! _have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +ERR_CODE=0 + +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" + +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) + + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? + + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" + + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} + +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi + + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_prep_test "loop" "end-to-end integrity" + +_setup_device + +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag + +rm -f "$fio_err" + +_cleanup_test +_show_result "$TID" $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 833fa0dbc700..338a235fd82a 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 874568b3646b..04c52454e2ec 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_02" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index c30f797c6429..6e8f649fe93d 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index b01d75b3214d..9f6774ec0de6 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_04" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh index de2141533074..2b8d99e007be 100755 --- a/tools/testing/selftests/ublk/test_loop_05.sh +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_06.sh b/tools/testing/selftests/ublk/test_loop_06.sh index 1d1a8a725502..e73f6f4844db 100755 --- a/tools/testing/selftests/ublk/test_loop_06.sh +++ b/tools/testing/selftests/ublk/test_loop_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_06" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_07.sh b/tools/testing/selftests/ublk/test_loop_07.sh index 493f3fb611a5..264d20e7c530 100755 --- a/tools/testing/selftests/ublk/test_loop_07.sh +++ b/tools/testing/selftests/ublk/test_loop_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_07" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with user copy" diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index c2cb8f7a09fe..eebce8076530 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 8accd35beb55..654bdff39664 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_03.sh b/tools/testing/selftests/ublk/test_null_03.sh index 0051067b4686..29cd09f06672 100755 --- a/tools/testing/selftests/ublk/test_null_03.sh +++ b/tools/testing/selftests/ublk/test_null_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_part_01.sh b/tools/testing/selftests/ublk/test_part_01.sh new file mode 100755 index 000000000000..8028f6e4b3a5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_01.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +format_backing_file() +{ + local backing_file=$1 + + # Create ublk device to write partition table + local tmp_dev=$(_add_ublk_dev -t loop "${backing_file}") + [ $? -ne 0 ] && return 1 + + # Write partition table with sfdisk + sfdisk /dev/ublkb"${tmp_dev}" > /dev/null 2>&1 <<EOF +label: dos +start=2048, size=100MiB, type=83 +start=206848, size=100MiB, type=83 +EOF + local ret=$? + + "${UBLK_PROG}" del -n "${tmp_dev}" + + return $ret +} + +test_auto_part_scan() +{ + local backing_file=$1 + + # Create device WITHOUT --no_auto_part_scan + local dev_id=$(_add_ublk_dev -t loop "${backing_file}") + [ $? -ne 0 ] && return 1 + + udevadm settle + + # Partitions should be auto-detected + if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + "${UBLK_PROG}" del -n "${dev_id}" + return 0 +} + +test_no_auto_part_scan() +{ + local backing_file=$1 + + # Create device WITH --no_auto_part_scan + local dev_id=$(_add_ublk_dev -t loop --no_auto_part_scan "${backing_file}") + [ $? -ne 0 ] && return 1 + + udevadm settle + + # Partitions should NOT be auto-detected + if [ -e /dev/ublkb"${dev_id}"p1 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + # Manual scan should work + blockdev --rereadpt /dev/ublkb"${dev_id}" > /dev/null 2>&1 + udevadm settle + + if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + "${UBLK_PROG}" del -n "${dev_id}" + return 0 +} + +if ! _have_program sfdisk || ! _have_program blockdev; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_NO_AUTO_PART_SCAN" + +if ! _have_feature "UBLK_F_NO_AUTO_PART_SCAN"; then + _cleanup_test "generic" + exit "$UBLK_SKIP_CODE" +fi + + +# Create and format backing file with partition table +_create_backfile 0 256M +format_backing_file "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test normal auto partition scan +[ "$ERR_CODE" -eq 0 ] && test_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test no auto partition scan with manual scan +[ "$ERR_CODE" -eq 0 ] && test_no_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_part_02.sh index 76379362e0a2..7d42ab4d6e83 100755 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_15" ERR_CODE=0 _test_partition_scan_no_hang() @@ -34,7 +33,7 @@ _test_partition_scan_no_hang() # The add command should return quickly because partition scan is async. # Now sleep briefly to let the async partition scan work start and hit # the delay in the fault_inject handler. - sleep 1 + _ublk_sleep 1 5 # Kill the ublk daemon while partition scan is potentially blocked # And check state transitions properly @@ -47,13 +46,13 @@ _test_partition_scan_no_hang() if [ "$state" != "${expected_state}" ]; then echo "FAIL: Device state is $state, expected ${expected_state}" ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 return fi echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 } _prep_test "partition_scan" "verify async partition scan prevents IO hang" diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_recover_01.sh index baf5b156193d..2672f9c40fa8 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_recover_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_04" ERR_CODE=0 ublk_run_recover_test() @@ -26,6 +25,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 & ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_recover_02.sh index 7b5083afc02a..bda5064bc31f 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_recover_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_05" ERR_CODE=0 ublk_run_recover_test() @@ -30,6 +29,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 -z & ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_recover_03.sh index d1f973c8c645..e0dc0b8fe5d6 100755 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ b/tools/testing/selftests/ublk/test_recover_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_11" ERR_CODE=0 ublk_run_quiesce_recover() diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_recover_04.sh index cd9b44b97c24..178443394ca5 100755 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ b/tools/testing/selftests/ublk/test_recover_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_14" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7d3150f057d4..a9322ce496e9 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_01" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 4bdd921081e5..6c114194f9c9 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 3ed4c9b2d8c0..4e81ca0db758 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_03" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh index efa8dc33234b..6c6f44b172bc 100755 --- a/tools/testing/selftests/ublk/test_stress_04.sh +++ b/tools/testing/selftests/ublk/test_stress_04.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_04" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 68a194144302..7e9324de2030 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh index 37188ec2e1f7..c72e5d0b14be 100755 --- a/tools/testing/selftests/ublk/test_stress_06.sh +++ b/tools/testing/selftests/ublk/test_stress_06.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh index fb061fc26d36..04c2764d5238 100755 --- a/tools/testing/selftests/ublk/test_stress_07.sh +++ b/tools/testing/selftests/ublk/test_stress_07.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh new file mode 100755 index 000000000000..37f7d204879a --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh new file mode 100755 index 000000000000..53c1e3b2ab30 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index 4e4f0fdf3c9b..3bc821aadad8 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh index 5820ab2efba4..4a7d2b21a6bf 100755 --- a/tools/testing/selftests/ublk/test_stripe_02.sh +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_02" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh index 20b977e27814..a1c159d54e53 100755 --- a/tools/testing/selftests/ublk/test_stripe_03.sh +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh index 1b51ed2f1d84..0c30bd6c2b3b 100755 --- a/tools/testing/selftests/ublk/test_stripe_04.sh +++ b/tools/testing/selftests/ublk/test_stripe_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_04" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on zero copy" diff --git a/tools/testing/selftests/ublk/test_stripe_05.sh b/tools/testing/selftests/ublk/test_stripe_05.sh index 05d71951d710..6ddfa88ad226 100755 --- a/tools/testing/selftests/ublk/test_stripe_05.sh +++ b/tools/testing/selftests/ublk/test_stripe_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_06.sh b/tools/testing/selftests/ublk/test_stripe_06.sh index d06cac7626e2..a2c7bf4cc613 100755 --- a/tools/testing/selftests/ublk/test_stripe_06.sh +++ b/tools/testing/selftests/ublk/test_stripe_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_06" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on user copy" diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt index b2f60a92b118..9d36ba35468f 100644 --- a/tools/testing/selftests/ublk/trace/seq_io.bt +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -2,23 +2,52 @@ $1: dev_t $2: RWBS $3: strlen($2) + + Track request order between block_io_start and block_rq_complete. + Sequence starts at 1 so 0 means "never seen". On first valid + completion, sync complete_seq to handle probe attachment races. + block_rq_complete listed first to reduce missed completion window. */ + BEGIN { - @last_rw[$1, str($2)] = (uint64)0; + @start_seq = (uint64)1; + @complete_seq = (uint64)0; + @out_of_order = (uint64)0; + @start_order[0] = (uint64)0; + delete(@start_order[0]); + printf("BPFTRACE_READY\n"); } + tracepoint:block:block_rq_complete +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ { - $dev = $1; - if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { - $last = @last_rw[$dev, str($2)]; - if ((uint64)args.sector != $last) { - printf("io_out_of_order: exp %llu actual %llu\n", - args.sector, $last); + $expected = @start_order[args.sector]; + if ($expected > 0) { + if (@complete_seq == 0) { + @complete_seq = $expected; + } + if ($expected != @complete_seq) { + printf("out_of_order: sector %llu started at seq %llu but completed at seq %llu\n", + args.sector, $expected, @complete_seq); + @out_of_order = @out_of_order + 1; } - @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + delete(@start_order[args.sector]); + @complete_seq = @complete_seq + 1; } } +tracepoint:block:block_io_start +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ +{ + @start_order[args.sector] = @start_seq; + @start_seq = @start_seq + 1; +} + END { - clear(@last_rw); + printf("total_start: %llu total_complete: %llu out_of_order: %llu\n", + @start_seq - 1, @complete_seq, @out_of_order); + clear(@start_order); + clear(@start_seq); + clear(@complete_seq); + clear(@out_of_order); } diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index a852e0b7153e..aab522f26167 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -21,6 +21,60 @@ #define round_up(val, rnd) \ (((val) + ((rnd) - 1)) & ~((rnd) - 1)) +/* small sized & per-thread allocator */ +struct allocator { + unsigned int size; + cpu_set_t *set; +}; + +static inline int allocator_init(struct allocator *a, unsigned size) +{ + a->set = CPU_ALLOC(size); + a->size = size; + + if (a->set) + return 0; + return -ENOMEM; +} + +static inline void allocator_deinit(struct allocator *a) +{ + CPU_FREE(a->set); + a->set = NULL; + a->size = 0; +} + +static inline int allocator_get(struct allocator *a) +{ + int i; + + for (i = 0; i < a->size; i += 1) { + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (!CPU_ISSET_S(i, set_size, a->set)) { + CPU_SET_S(i, set_size, a->set); + return i; + } + } + + return -1; +} + +static inline void allocator_put(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (i >= 0 && i < a->size) + CPU_CLR_S(i, set_size, a->set); +} + +static inline int allocator_get_val(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + return CPU_ISSET_S(i, set_size, a->set); +} + static inline unsigned int ilog2(unsigned int x) { if (x == 0) @@ -43,6 +97,7 @@ static inline void ublk_err(const char *fmt, ...) va_start(ap, fmt); vfprintf(stderr, fmt, ap); + va_end(ap); } static inline void ublk_log(const char *fmt, ...) @@ -52,6 +107,7 @@ static inline void ublk_log(const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } @@ -62,7 +118,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } +#define ublk_assert(x) do { \ + if (!(x)) { \ + ublk_err("%s %d: assert!\n", __func__, __LINE__); \ + assert(x); \ + } \ +} while (0) + #endif diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h index 50c261005111..5da223731b81 100644 --- a/tools/testing/selftests/vDSO/vdso_config.h +++ b/tools/testing/selftests/vDSO/vdso_config.h @@ -66,7 +66,7 @@ static const char *versions[7] = { }; __attribute__((unused)) -static const char *names[2][7] = { +static const char *names[2][8] = { { "__kernel_gettimeofday", "__kernel_clock_gettime", @@ -75,6 +75,7 @@ static const char *names[2][7] = { "__kernel_getcpu", "__kernel_clock_gettime64", "__kernel_getrandom", + "__kernel_clock_getres_time64", }, { "__vdso_gettimeofday", @@ -84,6 +85,7 @@ static const char *names[2][7] = { "__vdso_getcpu", "__vdso_clock_gettime64", "__vdso_getrandom", + "__vdso_clock_getres_time64", }, }; diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c index c620317eaeea..b162a4ba9c4f 100644 --- a/tools/testing/selftests/vDSO/vdso_test_abi.c +++ b/tools/testing/selftests/vDSO/vdso_test_abi.c @@ -36,6 +36,7 @@ typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz); typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts); typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts); +typedef long (*vdso_clock_getres_time64_t)(clockid_t clk_id, struct vdso_timespec64 *ts); typedef time_t (*vdso_time_t)(time_t *t); static const char * const vdso_clock_name[] = { @@ -179,7 +180,7 @@ static void vdso_test_clock_getres(clockid_t clk_id) clock_getres_fail++; } - ret = syscall(SYS_clock_getres, clk_id, &sys_ts); + ret = syscall(__NR_clock_getres, clk_id, &sys_ts); ksft_print_msg("The syscall resolution is %lld %lld\n", (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); @@ -196,6 +197,55 @@ static void vdso_test_clock_getres(clockid_t clk_id) } } +#ifdef __NR_clock_getres_time64 +static void vdso_test_clock_getres_time64(clockid_t clk_id) +{ + int clock_getres_fail = 0; + + /* Find clock_getres. */ + vdso_clock_getres_time64_t vdso_clock_getres_time64 = + (vdso_clock_getres_time64_t)vdso_sym(version, name[7]); + + if (!vdso_clock_getres_time64) { + ksft_print_msg("Couldn't find %s\n", name[7]); + ksft_test_result_skip("%s %s\n", name[7], + vdso_clock_name[clk_id]); + return; + } + + struct vdso_timespec64 ts, sys_ts; + long ret = VDSO_CALL(vdso_clock_getres_time64, 2, clk_id, &ts); + + if (ret == 0) { + ksft_print_msg("The vdso resolution is %lld %lld\n", + (long long)ts.tv_sec, (long long)ts.tv_nsec); + } else { + clock_getres_fail++; + } + + ret = syscall(__NR_clock_getres_time64, clk_id, &sys_ts); + + ksft_print_msg("The syscall resolution is %lld %lld\n", + (long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec); + + if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec)) + clock_getres_fail++; + + if (clock_getres_fail > 0) { + ksft_test_result_fail("%s %s\n", name[7], + vdso_clock_name[clk_id]); + } else { + ksft_test_result_pass("%s %s\n", name[7], + vdso_clock_name[clk_id]); + } +} +#else /* !__NR_clock_getres_time64 */ +static void vdso_test_clock_getres_time64(clockid_t clk_id) +{ + ksft_test_result_skip("%s %s\n", name[7], vdso_clock_name[clk_id]); +} +#endif /* __NR_clock_getres_time64 */ + /* * This function calls vdso_test_clock_gettime and vdso_test_clock_getres * with different values for clock_id. @@ -208,9 +258,10 @@ static inline void vdso_test_clock(clockid_t clock_id) vdso_test_clock_gettime64(clock_id); vdso_test_clock_getres(clock_id); + vdso_test_clock_getres_time64(clock_id); } -#define VDSO_TEST_PLAN 29 +#define VDSO_TEST_PLAN 38 int main(int argc, char **argv) { diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c index bea8ad54da11..3fe49cbdae98 100644 --- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -16,9 +16,7 @@ #include "vdso_config.h" #include "vdso_call.h" -struct getcpu_cache; -typedef long (*getcpu_t)(unsigned int *, unsigned int *, - struct getcpu_cache *); +typedef long (*getcpu_t)(unsigned int *, unsigned int *, void *); int main(int argc, char **argv) { diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S index 8c3cbf4dfd6a..16f985b089d4 100644 --- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S +++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S @@ -16,5 +16,5 @@ #elif defined(__s390x__) #include "../../../../arch/s390/kernel/vdso/vgetrandom-chacha.S" #elif defined(__x86_64__) -#include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S" +#include "../../../../arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S" #endif diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 3c796ca99a50..8e90e409e91d 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,5 +1,13 @@ +ARCH ?= $(shell uname -m) + +ifeq (,$(filter $(ARCH),arm64 x86_64)) +# Do nothing on unsupported architectures +include ../lib.mk +else + CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test +TEST_GEN_PROGS += vfio_dma_mapping_mmio_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test @@ -27,3 +35,5 @@ TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_PROGS_O) $(LIBVFIO_O)) -include $(TEST_DEP_FILES) EXTRA_CLEAN += $(TEST_GEN_PROGS_O) $(TEST_DEP_FILES) + +endif diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h index 279ddcd70194..1b6da54cc2cb 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -23,4 +23,13 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]); char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); +/* + * Reserve virtual address space of size at an address satisfying + * (vaddr % align) == offset. + * + * Returns the reserved vaddr. The caller is responsible for unmapping + * the returned region. + */ +void *mmap_reserve(size_t size, size_t align, size_t offset); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h index 5c9b9dc6d993..e9a3386a4719 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -61,6 +61,12 @@ iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); +#define MODE_VFIO_TYPE1_IOMMU "vfio_type1_iommu" +#define MODE_VFIO_TYPE1V2_IOMMU "vfio_type1v2_iommu" +#define MODE_IOMMUFD_COMPAT_TYPE1 "iommufd_compat_type1" +#define MODE_IOMMUFD_COMPAT_TYPE1V2 "iommufd_compat_type1v2" +#define MODE_IOMMUFD "iommufd" + /* * Generator for VFIO selftests fixture variants that replicate across all * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c index 58b7fb7430d4..035dac069d60 100644 --- a/tools/testing/selftests/vfio/lib/iommu.c +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -20,32 +20,32 @@ #include "../../../kselftest.h" #include <libvfio.h> -const char *default_iommu_mode = "iommufd"; +const char *default_iommu_mode = MODE_IOMMUFD; /* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ static const struct iommu_mode iommu_modes[] = { { - .name = "vfio_type1_iommu", + .name = MODE_VFIO_TYPE1_IOMMU, .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1_IOMMU, }, { - .name = "vfio_type1v2_iommu", + .name = MODE_VFIO_TYPE1V2_IOMMU, .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1v2_IOMMU, }, { - .name = "iommufd_compat_type1", + .name = MODE_IOMMUFD_COMPAT_TYPE1, .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1_IOMMU, }, { - .name = "iommufd_compat_type1v2", + .name = MODE_IOMMUFD_COMPAT_TYPE1V2, .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1v2_IOMMU, }, { - .name = "iommufd", + .name = MODE_IOMMUFD, }, }; diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c index a23a3cc5be69..3a3d1ed635c1 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.c +++ b/tools/testing/selftests/vfio/lib/libvfio.c @@ -2,6 +2,9 @@ #include <stdio.h> #include <stdlib.h> +#include <sys/mman.h> + +#include <linux/align.h> #include "../../../kselftest.h" #include <libvfio.h> @@ -76,3 +79,25 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]) return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; } + +void *mmap_reserve(size_t size, size_t align, size_t offset) +{ + void *map_base, *map_align; + size_t delta; + + VFIO_ASSERT_GT(align, offset); + delta = align - offset; + + map_base = mmap(NULL, size + align, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + VFIO_ASSERT_NE(map_base, MAP_FAILED); + + map_align = (void *)(ALIGN((uintptr_t)map_base + delta, align) - delta); + + if (map_align > map_base) + VFIO_ASSERT_EQ(munmap(map_base, map_align - map_base), 0); + + VFIO_ASSERT_EQ(munmap(map_align + size, map_base + align - map_align), 0); + + return map_align; +} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index fac4c0ecadef..4e5871f1ebc3 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -11,10 +11,14 @@ #include <sys/ioctl.h> #include <sys/mman.h> +#include <linux/align.h> #include <linux/iommufd.h> +#include <linux/kernel.h> #include <linux/limits.h> +#include <linux/log2.h> #include <linux/mman.h> #include <linux/overflow.h> +#include <linux/sizes.h> #include <linux/types.h> #include <linux/vfio.h> @@ -123,20 +127,38 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index, static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) { struct vfio_pci_bar *bar = &device->bars[index]; + size_t align, size; int prot = 0; + void *vaddr; VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); VFIO_ASSERT_NULL(bar->vaddr); VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP); + VFIO_ASSERT_TRUE(is_power_of_2(bar->info.size)); if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) prot |= PROT_READ; if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) prot |= PROT_WRITE; - bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED, + size = bar->info.size; + + /* + * Align BAR mmaps to improve page fault granularity during potential + * subsequent IOMMU mapping of these BAR vaddr. 1G for x86 is the + * largest hugepage size across any architecture, so no benefit from + * larger alignment. BARs smaller than 1G will be aligned by their + * power-of-two size, guaranteeing sufficient alignment for smaller + * hugepages, if present. + */ + align = min_t(size_t, size, SZ_1G); + + vaddr = mmap_reserve(size, align, 0); + bar->vaddr = mmap(vaddr, size, prot, MAP_SHARED | MAP_FIXED, device->fd, bar->info.offset); VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED); + + madvise(bar->vaddr, size, MADV_HUGEPAGE); } static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c new file mode 100644 index 000000000000..957a89ce7b3a --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <sys/mman.h> +#include <unistd.h> + +#include <uapi/linux/types.h> +#include <linux/pci_regs.h> +#include <linux/sizes.h> +#include <linux/vfio.h> + +#include <libvfio.h> + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +static struct vfio_pci_bar *largest_mapped_bar(struct vfio_pci_device *device) +{ + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + struct vfio_pci_bar *largest = NULL; + u64 bar_size = 0; + + for (int i = 0; i < PCI_STD_NUM_BARS; i++) { + struct vfio_pci_bar *bar = &device->bars[i]; + + if (!bar->vaddr) + continue; + + /* + * iommu_map() maps with READ|WRITE, so require the same + * abilities for the underlying VFIO region. + */ + if ((bar->info.flags & flags) != flags) + continue; + + if (bar->info.size > bar_size) { + bar_size = bar->info.size; + largest = bar; + } + } + + return largest; +} + +FIXTURE(vfio_dma_mapping_mmio_test) { + struct iommu *iommu; + struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; + struct vfio_pci_bar *bar; +}; + +FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +#undef FIXTURE_VARIANT_ADD_IOMMU_MODE + +FIXTURE_SETUP(vfio_dma_mapping_mmio_test) +{ + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); + self->bar = largest_mapped_bar(self->device); + + if (!self->bar) + SKIP(return, "No mappable BAR found on device %s", device_bdf); +} + +FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test) +{ + iova_allocator_cleanup(self->iova_allocator); + vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); +} + +static void do_mmio_map_test(struct iommu *iommu, + struct iova_allocator *iova_allocator, + void *vaddr, size_t size) +{ + struct dma_region region = { + .vaddr = vaddr, + .size = size, + .iova = iova_allocator_alloc(iova_allocator, size), + }; + + /* + * NOTE: Check for iommufd compat success once it lands. Native iommufd + * will never support this. + */ + if (!strcmp(iommu->mode->name, MODE_VFIO_TYPE1V2_IOMMU) || + !strcmp(iommu->mode->name, MODE_VFIO_TYPE1_IOMMU)) { + iommu_map(iommu, ®ion); + iommu_unmap(iommu, ®ion); + } else { + VFIO_ASSERT_NE(__iommu_map(iommu, ®ion), 0); + VFIO_ASSERT_NE(__iommu_unmap(iommu, ®ion, NULL), 0); + } +} + +TEST_F(vfio_dma_mapping_mmio_test, map_full_bar) +{ + do_mmio_map_test(self->iommu, self->iova_allocator, + self->bar->vaddr, self->bar->info.size); +} + +TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar) +{ + if (self->bar->info.size < 2 * getpagesize()) + SKIP(return, "BAR too small (size=0x%llx)", self->bar->info.size); + + do_mmio_map_test(self->iommu, self->iova_allocator, + self->bar->vaddr, getpagesize()); +} + +/* Test IOMMU mapping of BAR mmap with intentionally poor vaddr alignment. */ +TEST_F(vfio_dma_mapping_mmio_test, map_bar_misaligned) +{ + /* Limit size to bound test time for large BARs */ + size_t size = min_t(size_t, self->bar->info.size, SZ_1G); + void *vaddr; + + vaddr = mmap_reserve(size, SZ_1G, getpagesize()); + vaddr = mmap(vaddr, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, + self->device->fd, self->bar->info.offset); + VFIO_ASSERT_NE(vaddr, MAP_FAILED); + + do_mmio_map_test(self->iommu, self->iova_allocator, vaddr, size); + + VFIO_ASSERT_EQ(munmap(vaddr, size), 0); +} + +int main(int argc, char *argv[]) +{ + device_bdf = vfio_selftests_get_bdf(&argc, argv); + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 3bf984b337ac..abb170bdcef7 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -161,12 +161,8 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) if (rc == -EOPNOTSUPP) goto unmap; - /* - * IOMMUFD compatibility-mode does not support huge mappings when - * using VFIO_TYPE1_IOMMU. - */ - if (!strcmp(variant->iommu_mode, "iommufd_compat_type1")) - mapping_size = SZ_4K; + if (self->iommu->mode->iommu_type == VFIO_TYPE1_IOMMU) + goto unmap; ASSERT_EQ(0, rc); printf("Found IOMMU mappings for IOVA 0x%lx:\n", region.iova); diff --git a/tools/testing/selftests/vsock/settings b/tools/testing/selftests/vsock/settings index 694d70710ff0..79b65bdf05db 100644 --- a/tools/testing/selftests/vsock/settings +++ b/tools/testing/selftests/vsock/settings @@ -1 +1 @@ -timeout=300 +timeout=1200 diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index c7b270dd77a9..dc8dbe74a6d0 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -7,6 +7,7 @@ # * virtme-ng # * busybox-static (used by virtme-ng) # * qemu (used by virtme-ng) +# * socat # # shellcheck disable=SC2317,SC2119 @@ -41,14 +42,119 @@ readonly KERNEL_CMDLINE="\ virtme.ssh virtme_ssh_channel=tcp virtme_ssh_user=$USER \ " readonly LOG=$(mktemp /tmp/vsock_vmtest_XXXX.log) -readonly TEST_NAMES=(vm_server_host_client vm_client_host_server vm_loopback) + +# Namespace tests must use the ns_ prefix. This is checked in check_netns() and +# is used to determine if a test needs namespace setup before test execution. +readonly TEST_NAMES=( + vm_server_host_client + vm_client_host_server + vm_loopback + ns_host_vsock_ns_mode_ok + ns_host_vsock_child_ns_mode_ok + ns_global_same_cid_fails + ns_local_same_cid_ok + ns_global_local_same_cid_ok + ns_local_global_same_cid_ok + ns_diff_global_host_connect_to_global_vm_ok + ns_diff_global_host_connect_to_local_vm_fails + ns_diff_global_vm_connect_to_global_host_ok + ns_diff_global_vm_connect_to_local_host_fails + ns_diff_local_host_connect_to_local_vm_fails + ns_diff_local_vm_connect_to_local_host_fails + ns_diff_global_to_local_loopback_local_fails + ns_diff_local_to_global_loopback_fails + ns_diff_local_to_local_loopback_fails + ns_diff_global_to_global_loopback_ok + ns_same_local_loopback_ok + ns_same_local_host_connect_to_local_vm_ok + ns_same_local_vm_connect_to_local_host_ok + ns_delete_vm_ok + ns_delete_host_ok + ns_delete_both_ok +) readonly TEST_DESCS=( + # vm_server_host_client "Run vsock_test in server mode on the VM and in client mode on the host." + + # vm_client_host_server "Run vsock_test in client mode on the VM and in server mode on the host." + + # vm_loopback "Run vsock_test using the loopback transport in the VM." + + # ns_host_vsock_ns_mode_ok + "Check /proc/sys/net/vsock/ns_mode strings on the host." + + # ns_host_vsock_child_ns_mode_ok + "Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable." + + # ns_global_same_cid_fails + "Check QEMU fails to start two VMs with same CID in two different global namespaces." + + # ns_local_same_cid_ok + "Check QEMU successfully starts two VMs with same CID in two different local namespaces." + + # ns_global_local_same_cid_ok + "Check QEMU successfully starts one VM in a global ns and then another VM in a local ns with the same CID." + + # ns_local_global_same_cid_ok + "Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID." + + # ns_diff_global_host_connect_to_global_vm_ok + "Run vsock_test client in global ns with server in VM in another global ns." + + # ns_diff_global_host_connect_to_local_vm_fails + "Run socat to test a process in a global ns fails to connect to a VM in a local ns." + + # ns_diff_global_vm_connect_to_global_host_ok + "Run vsock_test client in VM in a global ns with server in another global ns." + + # ns_diff_global_vm_connect_to_local_host_fails + "Run socat to test a VM in a global ns fails to connect to a host process in a local ns." + + # ns_diff_local_host_connect_to_local_vm_fails + "Run socat to test a host process in a local ns fails to connect to a VM in another local ns." + + # ns_diff_local_vm_connect_to_local_host_fails + "Run socat to test a VM in a local ns fails to connect to a host process in another local ns." + + # ns_diff_global_to_local_loopback_local_fails + "Run socat to test a loopback vsock in a global ns fails to connect to a vsock in a local ns." + + # ns_diff_local_to_global_loopback_fails + "Run socat to test a loopback vsock in a local ns fails to connect to a vsock in a global ns." + + # ns_diff_local_to_local_loopback_fails + "Run socat to test a loopback vsock in a local ns fails to connect to a vsock in another local ns." + + # ns_diff_global_to_global_loopback_ok + "Run socat to test a loopback vsock in a global ns successfully connects to a vsock in another global ns." + + # ns_same_local_loopback_ok + "Run socat to test a loopback vsock in a local ns successfully connects to a vsock in the same ns." + + # ns_same_local_host_connect_to_local_vm_ok + "Run vsock_test client in a local ns with server in VM in same ns." + + # ns_same_local_vm_connect_to_local_host_ok + "Run vsock_test client in VM in a local ns with server in same ns." + + # ns_delete_vm_ok + "Check that deleting the VM's namespace does not break the socket connection" + + # ns_delete_host_ok + "Check that deleting the host's namespace does not break the socket connection" + + # ns_delete_both_ok + "Check that deleting the VM and host's namespaces does not break the socket connection" ) -readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback) +readonly USE_SHARED_VM=( + vm_server_host_client + vm_client_host_server + vm_loopback +) +readonly NS_MODES=("local" "global") VERBOSE=0 @@ -71,7 +177,7 @@ usage() { for ((i = 0; i < ${#TEST_NAMES[@]}; i++)); do name=${TEST_NAMES[${i}]} desc=${TEST_DESCS[${i}]} - printf "\t%-35s%-35s\n" "${name}" "${desc}" + printf "\t%-55s%-35s\n" "${name}" "${desc}" done echo @@ -103,13 +209,55 @@ check_result() { fi } +add_namespaces() { + local orig_mode + orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode) + + for mode in "${NS_MODES[@]}"; do + echo "${mode}" > /proc/sys/net/vsock/child_ns_mode + ip netns add "${mode}0" 2>/dev/null + ip netns add "${mode}1" 2>/dev/null + done + + echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode +} + +init_namespaces() { + for mode in "${NS_MODES[@]}"; do + # we need lo for qemu port forwarding + ip netns exec "${mode}0" ip link set dev lo up + ip netns exec "${mode}1" ip link set dev lo up + done +} + +del_namespaces() { + for mode in "${NS_MODES[@]}"; do + ip netns del "${mode}0" &>/dev/null + ip netns del "${mode}1" &>/dev/null + log_host "removed ns ${mode}0" + log_host "removed ns ${mode}1" + done +} + vm_ssh() { - ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@" + local ns_exec + + if [[ "${1}" == init_ns ]]; then + ns_exec="" + else + ns_exec="ip netns exec ${1}" + fi + + shift + + ${ns_exec} ssh -q -o UserKnownHostsFile=/dev/null -p "${SSH_HOST_PORT}" localhost "$@" + return $? } cleanup() { terminate_pidfiles "${!PIDFILES[@]}" + del_namespaces } check_args() { @@ -139,7 +287,7 @@ check_args() { } check_deps() { - for dep in vng ${QEMU} busybox pkill ssh; do + for dep in vng ${QEMU} busybox pkill ssh ss socat; do if [[ ! -x $(command -v "${dep}") ]]; then echo -e "skip: dependency ${dep} not found!\n" exit "${KSFT_SKIP}" @@ -153,6 +301,20 @@ check_deps() { fi } +check_netns() { + local tname=$1 + + # If the test requires NS support, check if NS support exists + # using /proc/self/ns + if [[ "${tname}" =~ ^ns_ ]] && + [[ ! -e /proc/self/ns ]]; then + log_host "No NS support detected for test ${tname}" + return 1 + fi + + return 0 +} + check_vng() { local tested_versions local version @@ -176,6 +338,20 @@ check_vng() { fi } +check_socat() { + local support_string + + support_string="$(socat -V)" + + if [[ "${support_string}" != *"WITH_VSOCK 1"* ]]; then + die "err: socat is missing vsock support" + fi + + if [[ "${support_string}" != *"WITH_UNIX 1"* ]]; then + die "err: socat is missing unix support" + fi +} + handle_build() { if [[ ! "${BUILD}" -eq 1 ]]; then return @@ -224,12 +400,22 @@ terminate_pidfiles() { done } +terminate_pids() { + local pid + + for pid in "$@"; do + kill -SIGTERM "${pid}" &>/dev/null || : + done +} + vm_start() { local pidfile=$1 + local ns=$2 local logfile=/dev/null local verbose_opt="" local kernel_opt="" local qemu_opts="" + local ns_exec="" local qemu qemu=$(command -v "${QEMU}") @@ -250,7 +436,11 @@ vm_start() { kernel_opt="${KERNEL_CHECKOUT}" fi - vng \ + if [[ "${ns}" != "init_ns" ]]; then + ns_exec="ip netns exec ${ns}" + fi + + ${ns_exec} vng \ --run \ ${kernel_opt} \ ${verbose_opt} \ @@ -265,6 +455,7 @@ vm_start() { } vm_wait_for_ssh() { + local ns=$1 local i i=0 @@ -272,7 +463,8 @@ vm_wait_for_ssh() { if [[ ${i} -gt ${WAIT_PERIOD_MAX} ]]; then die "Timed out waiting for guest ssh" fi - if vm_ssh -- true; then + + if vm_ssh "${ns}" -- true; then break fi i=$(( i + 1 )) @@ -286,50 +478,107 @@ wait_for_listener() local port=$1 local interval=$2 local max_intervals=$3 - local protocol=tcp - local pattern + local protocol=$4 local i - pattern=":$(printf "%04X" "${port}") " - - # for tcp protocol additionally check the socket state - [ "${protocol}" = "tcp" ] && pattern="${pattern}0A" - for i in $(seq "${max_intervals}"); do - if awk -v pattern="${pattern}" \ - 'BEGIN {rc=1} $2" "$4 ~ pattern {rc=0} END {exit rc}' \ - /proc/net/"${protocol}"*; then + case "${protocol}" in + tcp) + if ss --listening --tcp --numeric | grep -q ":${port} "; then + break + fi + ;; + vsock) + if ss --listening --vsock --numeric | grep -q ":${port} "; then + break + fi + ;; + unix) + # For unix sockets, port is actually the socket path + if ss --listening --unix | grep -q "${port}"; then + break + fi + ;; + *) + echo "Unknown protocol: ${protocol}" >&2 break - fi + ;; + esac sleep "${interval}" done } vm_wait_for_listener() { - local port=$1 + local ns=$1 + local port=$2 + local protocol=$3 - vm_ssh <<EOF + vm_ssh "${ns}" <<EOF $(declare -f wait_for_listener) -wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX} +wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX} ${protocol} EOF } host_wait_for_listener() { - local port=$1 + local ns=$1 + local port=$2 + local protocol=$3 + + if [[ "${ns}" == "init_ns" ]]; then + wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}" "${protocol}" + else + ip netns exec "${ns}" bash <<-EOF + $(declare -f wait_for_listener) + wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX} ${protocol} + EOF + fi +} + +vm_dmesg_oops_count() { + local ns=$1 + + vm_ssh "${ns}" -- dmesg 2>/dev/null | grep -c -i 'Oops' +} + +vm_dmesg_warn_count() { + local ns=$1 + + vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock' +} + +vm_dmesg_check() { + local pidfile=$1 + local ns=$2 + local oops_before=$3 + local warn_before=$4 + local oops_after warn_after + + oops_after=$(vm_dmesg_oops_count "${ns}") + if [[ "${oops_after}" -gt "${oops_before}" ]]; then + echo "FAIL: kernel oops detected on vm in ns ${ns}" | log_host + return 1 + fi + + warn_after=$(vm_dmesg_warn_count "${ns}") + if [[ "${warn_after}" -gt "${warn_before}" ]]; then + echo "FAIL: kernel warning detected on vm in ns ${ns}" | log_host + return 1 + fi - wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}" + return 0 } vm_vsock_test() { - local host=$1 - local cid=$2 - local port=$3 + local ns=$1 + local host=$2 + local cid=$3 + local port=$4 local rc # log output and use pipefail to respect vsock_test errors set -o pipefail if [[ "${host}" != server ]]; then - vm_ssh -- "${VSOCK_TEST}" \ + vm_ssh "${ns}" -- "${VSOCK_TEST}" \ --mode=client \ --control-host="${host}" \ --peer-cid="${cid}" \ @@ -337,7 +586,7 @@ vm_vsock_test() { 2>&1 | log_guest rc=$? else - vm_ssh -- "${VSOCK_TEST}" \ + vm_ssh "${ns}" -- "${VSOCK_TEST}" \ --mode=server \ --peer-cid="${cid}" \ --control-port="${port}" \ @@ -349,7 +598,7 @@ vm_vsock_test() { return $rc fi - vm_wait_for_listener "${port}" + vm_wait_for_listener "${ns}" "${port}" "tcp" rc=$? fi set +o pipefail @@ -358,25 +607,35 @@ vm_vsock_test() { } host_vsock_test() { - local host=$1 - local cid=$2 - local port=$3 + local ns=$1 + local host=$2 + local cid=$3 + local port=$4 + shift 4 + local extra_args=("$@") local rc + local cmd="${VSOCK_TEST}" + if [[ "${ns}" != "init_ns" ]]; then + cmd="ip netns exec ${ns} ${cmd}" + fi + # log output and use pipefail to respect vsock_test errors set -o pipefail if [[ "${host}" != server ]]; then - ${VSOCK_TEST} \ + ${cmd} \ --mode=client \ --peer-cid="${cid}" \ --control-host="${host}" \ - --control-port="${port}" 2>&1 | log_host + --control-port="${port}" \ + "${extra_args[@]}" 2>&1 | log_host rc=$? else - ${VSOCK_TEST} \ + ${cmd} \ --mode=server \ --peer-cid="${cid}" \ - --control-port="${port}" 2>&1 | log_host & + --control-port="${port}" \ + "${extra_args[@]}" 2>&1 | log_host & rc=$? if [[ $rc -ne 0 ]]; then @@ -384,7 +643,7 @@ host_vsock_test() { return $rc fi - host_wait_for_listener "${port}" + host_wait_for_listener "${ns}" "${port}" "tcp" rc=$? fi set +o pipefail @@ -427,12 +686,584 @@ log_guest() { LOG_PREFIX=guest log "$@" } +ns_get_mode() { + local ns=$1 + + ip netns exec "${ns}" cat /proc/sys/net/vsock/ns_mode 2>/dev/null +} + +test_ns_host_vsock_ns_mode_ok() { + for mode in "${NS_MODES[@]}"; do + local actual + + actual=$(ns_get_mode "${mode}0") + if [[ "${actual}" != "${mode}" ]]; then + log_host "expected mode ${mode}, got ${actual}" + return "${KSFT_FAIL}" + fi + done + + return "${KSFT_PASS}" +} + +test_ns_diff_global_host_connect_to_global_vm_ok() { + local oops_before warn_before + local pids pid pidfile + local ns0 ns1 port + declare -a pids + local unixfile + ns0="global0" + ns1="global1" + port=1234 + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns0}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + unixfile=$(mktemp -u /tmp/XXXX.sock) + ip netns exec "${ns1}" \ + socat TCP-LISTEN:"${TEST_HOST_PORT}",fork \ + UNIX-CONNECT:"${unixfile}" & + pids+=($!) + host_wait_for_listener "${ns1}" "${TEST_HOST_PORT}" "tcp" + + ip netns exec "${ns0}" socat UNIX-LISTEN:"${unixfile}",fork \ + TCP-CONNECT:localhost:"${TEST_HOST_PORT}" & + pids+=($!) + host_wait_for_listener "${ns0}" "${unixfile}" "unix" + + vm_vsock_test "${ns0}" "server" 2 "${TEST_GUEST_PORT}" + vm_wait_for_listener "${ns0}" "${TEST_GUEST_PORT}" "tcp" + host_vsock_test "${ns1}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pids "${pids[@]}" + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_diff_global_host_connect_to_local_vm_fails() { + local oops_before warn_before + local ns0="global0" + local ns1="local0" + local port=12345 + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + outfile=$(mktemp) + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns1}"; then + log_host "failed to start vm (cid=${VSOCK_CID}, ns=${ns0})" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns1}" + oops_before=$(vm_dmesg_oops_count "${ns1}") + warn_before=$(vm_dmesg_warn_count "${ns1}") + + vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" & + vm_wait_for_listener "${ns1}" "${port}" "vsock" + echo TEST | ip netns exec "${ns0}" \ + socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null + + vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" == "TEST" ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_diff_global_vm_connect_to_global_host_ok() { + local oops_before warn_before + local ns0="global0" + local ns1="global1" + local port=12345 + local unixfile + local dmesg_rc + local pidfile + local pids + local rc + + init_namespaces + + declare -a pids + + log_host "Setup socat bridge from ns ${ns0} to ns ${ns1} over port ${port}" + + unixfile=$(mktemp -u /tmp/XXXX.sock) + + ip netns exec "${ns0}" \ + socat TCP-LISTEN:"${port}" UNIX-CONNECT:"${unixfile}" & + pids+=($!) + host_wait_for_listener "${ns0}" "${port}" "tcp" + + ip netns exec "${ns1}" \ + socat UNIX-LISTEN:"${unixfile}" TCP-CONNECT:127.0.0.1:"${port}" & + pids+=($!) + host_wait_for_listener "${ns1}" "${unixfile}" "unix" + + log_host "Launching ${VSOCK_TEST} in ns ${ns1}" + host_vsock_test "${ns1}" "server" "${VSOCK_CID}" "${port}" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + terminate_pids "${pids[@]}" + rm -f "${unixfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_vsock_test "${ns0}" "10.0.2.2" 2 "${port}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pids[@]}" + rm -f "${unixfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" + +} + +test_ns_diff_global_vm_connect_to_local_host_fails() { + local ns0="global0" + local ns1="local0" + local port=12345 + local oops_before warn_before + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + terminate_pids "${pid}" + rm -f "${outfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_ssh "${ns0}" -- \ + bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_host_connect_to_local_vm_fails() { + local ns0="local0" + local ns1="local1" + local port=12345 + local oops_before warn_before + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + outfile=$(mktemp) + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns1}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns1}" + oops_before=$(vm_dmesg_oops_count "${ns1}") + warn_before=$(vm_dmesg_warn_count "${ns1}") + + vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" & + vm_wait_for_listener "${ns1}" "${port}" "vsock" + + echo TEST | ip netns exec "${ns0}" \ + socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null + + vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_vm_connect_to_local_host_fails() { + local oops_before warn_before + local ns0="local0" + local ns1="local1" + local port=12345 + local dmesg_rc + local pidfile + local result + local pid + + init_namespaces + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + log_host "failed to start vm (cid=${cid}, ns=${ns0})" + rm -f "${outfile}" + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns0}" + oops_before=$(vm_dmesg_oops_count "${ns0}") + warn_before=$(vm_dmesg_warn_count "${ns0}") + + vm_ssh "${ns0}" -- \ + bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest + + vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +__test_loopback_two_netns() { + local ns0=$1 + local ns1=$2 + local port=12345 + local result + local pid + + modprobe vsock_loopback &> /dev/null || : + + log_host "Launching socat in ns ${ns1}" + outfile=$(mktemp) + + ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" 2>/dev/null & + pid=$! + host_wait_for_listener "${ns1}" "${port}" "vsock" + + log_host "Launching socat in ns ${ns0}" + echo TEST | ip netns exec "${ns0}" socat STDIN VSOCK-CONNECT:1:"${port}" 2>/dev/null + terminate_pids "${pid}" + + result=$(cat "${outfile}") + rm -f "${outfile}" + + if [[ "${result}" == TEST ]]; then + return 0 + fi + + return 1 +} + +test_ns_diff_global_to_local_loopback_local_fails() { + init_namespaces + + if ! __test_loopback_two_netns "global0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_to_global_loopback_fails() { + init_namespaces + + if ! __test_loopback_two_netns "local0" "global0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_local_to_local_loopback_fails() { + init_namespaces + + if ! __test_loopback_two_netns "local0" "local1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_diff_global_to_global_loopback_ok() { + init_namespaces + + if __test_loopback_two_netns "global0" "global1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_same_local_loopback_ok() { + init_namespaces + + if __test_loopback_two_netns "local0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_same_local_host_connect_to_local_vm_ok() { + local oops_before warn_before + local ns="local0" + local port=1234 + local dmesg_rc + local pidfile + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns}" + oops_before=$(vm_dmesg_oops_count "${ns}") + warn_before=$(vm_dmesg_warn_count "${ns}") + + vm_vsock_test "${ns}" "server" 2 "${TEST_GUEST_PORT}" + + # Skip test 29 (transport release use-after-free): This test attempts + # binding both G2H and H2G CIDs. Because virtio-vsock (G2H) doesn't + # support local namespaces the test will fail when + # transport_g2h->stream_allow() returns false. This edge case only + # happens for vsock_test in client mode on the host in a local + # namespace. This is a false positive. + host_vsock_test "${ns}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" --skip=29 + rc=$? + + vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_same_local_vm_connect_to_local_host_ok() { + local oops_before warn_before + local ns="local0" + local port=1234 + local dmesg_rc + local pidfile + local rc + + init_namespaces + + pidfile="$(create_pidfile)" + + if ! vm_start "${pidfile}" "${ns}"; then + return "${KSFT_FAIL}" + fi + + vm_wait_for_ssh "${ns}" + oops_before=$(vm_dmesg_oops_count "${ns}") + warn_before=$(vm_dmesg_warn_count "${ns}") + + host_vsock_test "${ns}" "server" "${VSOCK_CID}" "${port}" + vm_vsock_test "${ns}" "10.0.2.2" 2 "${port}" + rc=$? + + vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}" + dmesg_rc=$? + + terminate_pidfiles "${pidfile}" + + if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +namespaces_can_boot_same_cid() { + local ns0=$1 + local ns1=$2 + local pidfile1 pidfile2 + local rc + + pidfile1="$(create_pidfile)" + + # The first VM should be able to start. If it can't then we have + # problems and need to return non-zero. + if ! vm_start "${pidfile1}" "${ns0}"; then + return 1 + fi + + pidfile2="$(create_pidfile)" + vm_start "${pidfile2}" "${ns1}" + rc=$? + terminate_pidfiles "${pidfile1}" "${pidfile2}" + + return "${rc}" +} + +test_ns_global_same_cid_fails() { + init_namespaces + + if namespaces_can_boot_same_cid "global0" "global1"; then + return "${KSFT_FAIL}" + fi + + return "${KSFT_PASS}" +} + +test_ns_local_global_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "local0" "global0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_global_local_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "global0" "local0"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_local_same_cid_ok() { + init_namespaces + + if namespaces_can_boot_same_cid "local0" "local1"; then + return "${KSFT_PASS}" + fi + + return "${KSFT_FAIL}" +} + +test_ns_host_vsock_child_ns_mode_ok() { + local orig_mode + local rc + + orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode) + + rc="${KSFT_PASS}" + for mode in "${NS_MODES[@]}"; do + local ns="${mode}0" + + if echo "${mode}" 2>/dev/null > /proc/sys/net/vsock/ns_mode; then + log_host "ns_mode should be read-only but write succeeded" + rc="${KSFT_FAIL}" + continue + fi + + if ! echo "${mode}" > /proc/sys/net/vsock/child_ns_mode; then + log_host "child_ns_mode should be writable to ${mode}" + rc="${KSFT_FAIL}" + continue + fi + done + + echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode + + return "${rc}" +} + test_vm_server_host_client() { - if ! vm_vsock_test "server" 2 "${TEST_GUEST_PORT}"; then + if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then return "${KSFT_FAIL}" fi - if ! host_vsock_test "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then + if ! host_vsock_test "init_ns" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then return "${KSFT_FAIL}" fi @@ -440,11 +1271,11 @@ test_vm_server_host_client() { } test_vm_client_host_server() { - if ! host_vsock_test "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then + if ! host_vsock_test "init_ns" "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then return "${KSFT_FAIL}" fi - if ! vm_vsock_test "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then + if ! vm_vsock_test "init_ns" "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then return "${KSFT_FAIL}" fi @@ -454,19 +1285,92 @@ test_vm_client_host_server() { test_vm_loopback() { local port=60000 # non-forwarded local port - vm_ssh -- modprobe vsock_loopback &> /dev/null || : + vm_ssh "init_ns" -- modprobe vsock_loopback &> /dev/null || : - if ! vm_vsock_test "server" 1 "${port}"; then + if ! vm_vsock_test "init_ns" "server" 1 "${port}"; then return "${KSFT_FAIL}" fi - if ! vm_vsock_test "127.0.0.1" 1 "${port}"; then + + if ! vm_vsock_test "init_ns" "127.0.0.1" 1 "${port}"; then return "${KSFT_FAIL}" fi return "${KSFT_PASS}" } +check_ns_delete_doesnt_break_connection() { + local pipefile pidfile outfile + local ns0="global0" + local ns1="global1" + local port=12345 + local pids=() + local rc=0 + + init_namespaces + + pidfile="$(create_pidfile)" + if ! vm_start "${pidfile}" "${ns0}"; then + return "${KSFT_FAIL}" + fi + vm_wait_for_ssh "${ns0}" + + outfile=$(mktemp) + vm_ssh "${ns0}" -- \ + socat VSOCK-LISTEN:"${port}",fork STDOUT > "${outfile}" 2>/dev/null & + pids+=($!) + vm_wait_for_listener "${ns0}" "${port}" "vsock" + + # We use a pipe here so that we can echo into the pipe instead of using + # socat and a unix socket file. We just need a name for the pipe (not a + # regular file) so use -u. + pipefile=$(mktemp -u /tmp/vmtest_pipe_XXXX) + ip netns exec "${ns1}" \ + socat PIPE:"${pipefile}" VSOCK-CONNECT:"${VSOCK_CID}":"${port}" & + pids+=($!) + + timeout "${WAIT_PERIOD}" \ + bash -c 'while [[ ! -e '"${pipefile}"' ]]; do sleep 1; done; exit 0' + + if [[ "$1" == "vm" ]]; then + ip netns del "${ns0}" + elif [[ "$1" == "host" ]]; then + ip netns del "${ns1}" + elif [[ "$1" == "both" ]]; then + ip netns del "${ns0}" + ip netns del "${ns1}" + fi + + echo "TEST" > "${pipefile}" + + timeout "${WAIT_PERIOD}" \ + bash -c 'while [[ ! -s '"${outfile}"' ]]; do sleep 1; done; exit 0' + + if grep -q "TEST" "${outfile}"; then + rc="${KSFT_PASS}" + else + rc="${KSFT_FAIL}" + fi + + terminate_pidfiles "${pidfile}" + terminate_pids "${pids[@]}" + rm -f "${outfile}" "${pipefile}" + + return "${rc}" +} + +test_ns_delete_vm_ok() { + check_ns_delete_doesnt_break_connection "vm" +} + +test_ns_delete_host_ok() { + check_ns_delete_doesnt_break_connection "host" +} + +test_ns_delete_both_ok() { + check_ns_delete_doesnt_break_connection "both" +} + shared_vm_test() { local tname @@ -499,6 +1403,11 @@ run_shared_vm_tests() { continue fi + if ! check_netns "${arg}"; then + check_result "${KSFT_SKIP}" "${arg}" + continue + fi + run_shared_vm_test "${arg}" check_result "$?" "${arg}" done @@ -518,8 +1427,8 @@ run_shared_vm_test() { host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') - vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') + vm_oops_cnt_before=$(vm_dmesg_oops_count "init_ns") + vm_warn_cnt_before=$(vm_dmesg_warn_count "init_ns") name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -537,13 +1446,13 @@ run_shared_vm_test() { rc=$KSFT_FAIL fi - vm_oops_cnt_after=$(vm_ssh -- dmesg | grep -i 'Oops' | wc -l) + vm_oops_cnt_after=$(vm_dmesg_oops_count "init_ns") if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then echo "FAIL: kernel oops detected on vm" | log_host rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') + vm_warn_cnt_after=$(vm_dmesg_warn_count "init_ns") if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host rc=$KSFT_FAIL @@ -552,6 +1461,49 @@ run_shared_vm_test() { return "${rc}" } +run_ns_tests() { + for arg in "${ARGS[@]}"; do + if shared_vm_test "${arg}"; then + continue + fi + + if ! check_netns "${arg}"; then + check_result "${KSFT_SKIP}" "${arg}" + continue + fi + + add_namespaces + + name=$(echo "${arg}" | awk '{ print $1 }') + log_host "Executing test_${name}" + + host_oops_before=$(dmesg 2>/dev/null | grep -c -i 'Oops') + host_warn_before=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock') + eval test_"${name}" + rc=$? + + host_oops_after=$(dmesg 2>/dev/null | grep -c -i 'Oops') + if [[ "${host_oops_after}" -gt "${host_oops_before}" ]]; then + echo "FAIL: kernel oops detected on host" | log_host + check_result "${KSFT_FAIL}" "${name}" + del_namespaces + continue + fi + + host_warn_after=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock') + if [[ "${host_warn_after}" -gt "${host_warn_before}" ]]; then + echo "FAIL: kernel warning detected on host" | log_host + check_result "${KSFT_FAIL}" "${name}" + del_namespaces + continue + fi + + check_result "${rc}" "${name}" + + del_namespaces + done +} + BUILD=0 QEMU="qemu-system-$(uname -m)" @@ -577,6 +1529,7 @@ fi check_args "${ARGS[@]}" check_deps check_vng +check_socat handle_build echo "1..${#ARGS[@]}" @@ -589,14 +1542,16 @@ cnt_total=0 if shared_vm_tests_requested "${ARGS[@]}"; then log_host "Booting up VM" pidfile="$(create_pidfile)" - vm_start "${pidfile}" - vm_wait_for_ssh + vm_start "${pidfile}" "init_ns" + vm_wait_for_ssh "init_ns" log_host "VM booted up" run_shared_vm_tests "${ARGS[@]}" terminate_pidfiles "${pidfile}" fi +run_ns_tests "${ARGS[@]}" + echo "SUMMARY: PASS=${cnt_pass} SKIP=${cnt_skip} FAIL=${cnt_fail}" echo "Log: ${LOG}" diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 0504c11c2de6..bb89d2dfaa2a 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y CONFIG_DETECT_HUNG_TASK=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_PANIC_TIMEOUT=-1 CONFIG_STACKTRACE=y diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index 5fb531e3ad7c..2e423a335e1c 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -31,7 +31,7 @@ void test_syscall_ins(void); extern const char test_page[]; -static void const *current_test_page_addr = test_page; +static const void *current_test_page_addr = test_page; /* State used by our signal handlers. */ static gregset_t initial_regs; @@ -40,7 +40,7 @@ static volatile unsigned long rip; static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n", @@ -56,7 +56,7 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void) static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); @@ -69,8 +69,6 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) ctx->uc_mcontext.gregs[REG_R11]); sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND); - - return; } static void test_sigreturn_to(unsigned long ip) @@ -84,7 +82,7 @@ static jmp_buf jmpbuf; static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; if (rip != ctx->uc_mcontext.gregs[REG_RIP]) { printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n", @@ -130,7 +128,7 @@ static void test_syscall_fallthrough_to(unsigned long ip) printf("[OK]\tWe survived\n"); } -int main() +int main(void) { /* * When the kernel returns from a slow-path syscall, it will diff --git a/tools/testing/shared/linux/kernel.h b/tools/testing/shared/linux/kernel.h index c0a2bb785b92..dc2b4ccfb185 100644 --- a/tools/testing/shared/linux/kernel.h +++ b/tools/testing/shared/linux/kernel.h @@ -21,9 +21,5 @@ #define schedule() #define PAGE_SHIFT 12 -#define __acquires(x) -#define __releases(x) -#define __must_hold(x) - #define EXPORT_PER_CPU_SYMBOL_GPL(x) #endif /* _KERNEL_H */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 9f0a9f5ed0fe..7fa56dcc53a6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -600,6 +600,14 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru return 0; } -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { /* For testing purposes. We indicate that an anon_vma has been cloned. */ if (src->anon_vma != NULL) { @@ -1265,11 +1274,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *vma, - struct vm_area_struct *next) -{ -} - static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/tools/tracing/rtla/.gitignore b/tools/tracing/rtla/.gitignore index 1a394ad26cc1..4d39d64ac08c 100644 --- a/tools/tracing/rtla/.gitignore +++ b/tools/tracing/rtla/.gitignore @@ -5,3 +5,7 @@ fixdep feature FEATURE-DUMP *.skel.h +custom_filename.txt +osnoise_irq_noise_hist.txt +osnoise_trace.txt +timerlat_trace.txt diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 746ccf2f5808..2701256abaf3 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -73,9 +73,21 @@ src/timerlat.bpf.o: src/timerlat.bpf.c src/timerlat.skel.h: src/timerlat.bpf.o $(QUIET_GENSKEL)$(SYSTEM_BPFTOOL) gen skeleton $< > $@ + +example/timerlat_bpf_action.o: example/timerlat_bpf_action.c + $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@ + +tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c + $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@ else src/timerlat.skel.h: $(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h + +example/timerlat_bpf_action.o: example/timerlat_bpf_action.c + $(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o" + +tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c + $(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o" endif $(RTLA): $(RTLA_IN) @@ -96,7 +108,8 @@ clean: doc_clean fixdep-clean $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-* $(Q)rm -rf feature - $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h -check: $(RTLA) - RTLA=$(RTLA) prove -o -f tests/ + $(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o +check: $(RTLA) tests/bpf/bpf_action_map.o + RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f -v tests/ +examples: example/timerlat_bpf_action.o .PHONY: FORCE clean check diff --git a/tools/tracing/rtla/example/timerlat_bpf_action.c b/tools/tracing/rtla/example/timerlat_bpf_action.c new file mode 100644 index 000000000000..ac1be049a848 --- /dev/null +++ b/tools/tracing/rtla/example/timerlat_bpf_action.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_tracing.h> + +char LICENSE[] SEC("license") = "GPL"; + +struct trace_event_raw_timerlat_sample { + unsigned long long timer_latency; +} __attribute__((preserve_access_index)); + +SEC("tp/timerlat_action") +int action_handler(struct trace_event_raw_timerlat_sample *tp_args) +{ + bpf_printk("Latency: %lld\n", tp_args->timer_latency); + return 0; +} diff --git a/tools/tracing/rtla/sample/timerlat_load.py b/tools/tracing/rtla/example/timerlat_load.py index a819c3588073..a819c3588073 100644 --- a/tools/tracing/rtla/sample/timerlat_load.py +++ b/tools/tracing/rtla/example/timerlat_load.py diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c index 8945aee58d51..a42615011962 100644 --- a/tools/tracing/rtla/src/actions.c +++ b/tools/tracing/rtla/src/actions.c @@ -19,8 +19,6 @@ actions_init(struct actions *self) self->len = 0; self->continue_flag = false; - memset(&self->present, 0, sizeof(self->present)); - /* This has to be set by the user */ self->trace_output_inst = NULL; } @@ -32,7 +30,9 @@ void actions_destroy(struct actions *self) { /* Free any action-specific data */ - for (struct action *action = self->list; action < self->list + self->len; action++) { + struct action *action; + + for_each_action(self, action) { if (action->type == ACTION_SHELL) free(action->command); if (action->type == ACTION_TRACE_OUTPUT) @@ -141,6 +141,8 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn) strcpy(trigger_c, trigger); token = strtok(trigger_c, ","); + if (!token) + return -1; if (strcmp(token, "trace") == 0) type = ACTION_TRACE_OUTPUT; @@ -179,12 +181,13 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn) /* Takes two arguments, num (signal) and pid */ while (token != NULL) { if (strlen(token) > 4 && strncmp(token, "num=", 4) == 0) { - signal = atoi(token + 4); + if (strtoi(token + 4, &signal)) + return -1; } else if (strlen(token) > 4 && strncmp(token, "pid=", 4) == 0) { if (strncmp(token + 4, "parent", 7) == 0) pid = -1; - else - pid = atoi(token + 4); + else if (strtoi(token + 4, &pid)) + return -1; } else { /* Invalid argument */ return -1; @@ -223,7 +226,7 @@ actions_perform(struct actions *self) int pid, retval; const struct action *action; - for (action = self->list; action < self->list + self->len; action++) { + for_each_action(self, action) { switch (action->type) { case ACTION_TRACE_OUTPUT: retval = save_trace_to_file(self->trace_output_inst, action->trace_output); diff --git a/tools/tracing/rtla/src/actions.h b/tools/tracing/rtla/src/actions.h index a4f9b570775b..fb77069c972b 100644 --- a/tools/tracing/rtla/src/actions.h +++ b/tools/tracing/rtla/src/actions.h @@ -42,6 +42,11 @@ struct actions { struct tracefs_instance *trace_output_inst; }; +#define for_each_action(actions, action) \ + for ((action) = (actions)->list; \ + (action) < (actions)->list + (actions)->len; \ + (action)++) + void actions_init(struct actions *self); void actions_destroy(struct actions *self); int actions_add_trace_output(struct actions *self, const char *trace_output); diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c index b197037fc58b..ceff76a62a30 100644 --- a/tools/tracing/rtla/src/common.c +++ b/tools/tracing/rtla/src/common.c @@ -4,11 +4,13 @@ #include <pthread.h> #include <signal.h> #include <stdlib.h> +#include <string.h> #include <unistd.h> +#include <getopt.h> #include "common.h" struct trace_instance *trace_inst; -int stop_tracing; +volatile int stop_tracing; static void stop_trace(int sig) { @@ -38,6 +40,84 @@ static void set_signals(struct common_params *params) } /* + * common_parse_options - parse common command line options + * + * @argc: argument count + * @argv: argument vector + * @common: common parameters structure + * + * Parse command line options that are common to all rtla tools. + * + * Returns: non zero if a common option was parsed, or 0 + * if the option should be handled by tool-specific parsing. + */ +int common_parse_options(int argc, char **argv, struct common_params *common) +{ + struct trace_events *tevent; + int saved_state = optind; + int c; + + static struct option long_options[] = { + {"cpus", required_argument, 0, 'c'}, + {"cgroup", optional_argument, 0, 'C'}, + {"debug", no_argument, 0, 'D'}, + {"duration", required_argument, 0, 'd'}, + {"event", required_argument, 0, 'e'}, + {"house-keeping", required_argument, 0, 'H'}, + {"priority", required_argument, 0, 'P'}, + {0, 0, 0, 0} + }; + + opterr = 0; + c = getopt_long(argc, argv, "c:C::Dd:e:H:P:", long_options, NULL); + opterr = 1; + + switch (c) { + case 'c': + if (parse_cpu_set(optarg, &common->monitored_cpus)) + fatal("Invalid -c cpu list"); + common->cpus = optarg; + break; + case 'C': + common->cgroup = 1; + common->cgroup_name = parse_optional_arg(argc, argv); + break; + case 'D': + config_debug = 1; + break; + case 'd': + common->duration = parse_seconds_duration(optarg); + if (!common->duration) + fatal("Invalid -d duration"); + break; + case 'e': + tevent = trace_event_alloc(optarg); + if (!tevent) + fatal("Error alloc trace event"); + + if (common->events) + tevent->next = common->events; + common->events = tevent; + break; + case 'H': + common->hk_cpus = 1; + if (parse_cpu_set(optarg, &common->hk_cpu_set)) + fatal("Error parsing house keeping CPUs"); + break; + case 'P': + if (parse_prio(optarg, &common->sched_param) == -1) + fatal("Invalid -P priority"); + common->set_sched = 1; + break; + default: + optind = saved_state; + return 0; + } + + return c; +} + +/* * common_apply_config - apply common configs to the initialized tool */ int @@ -348,3 +428,61 @@ int hist_main_loop(struct osnoise_tool *tool) return retval; } + +int osn_set_stop(struct osnoise_tool *tool) +{ + struct common_params *params = tool->params; + int retval; + + retval = osnoise_set_stop_us(tool->context, params->stop_us); + if (retval) { + err_msg("Failed to set stop us\n"); + return retval; + } + + retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us); + if (retval) { + err_msg("Failed to set stop total us\n"); + return retval; + } + + return 0; +} + +static void print_msg_array(const char * const *msgs) +{ + if (!msgs) + return; + + for (int i = 0; msgs[i]; i++) + fprintf(stderr, "%s\n", msgs[i]); +} + +/* + * common_usage - print complete usage information + */ +void common_usage(const char *tool, const char *mode, + const char *desc, const char * const *start_msgs, const char * const *opt_msgs) +{ + static const char * const common_options[] = { + " -h/--help: print this menu", + NULL + }; + fprintf(stderr, "rtla %s", tool); + if (strcmp(mode, "")) + fprintf(stderr, " %s", mode); + fprintf(stderr, ": %s (version %s)\n\n", desc, VERSION); + fprintf(stderr, " usage: [rtla] %s ", tool); + + if (strcmp(mode, "top") == 0) + fprintf(stderr, "[top] [-h] "); + else + fprintf(stderr, "%s [-h] ", mode); + + print_msg_array(start_msgs); + fprintf(stderr, "\n"); + print_msg_array(common_options); + print_msg_array(opt_msgs); + + exit(EXIT_SUCCESS); +} diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h index 9ec2b7632c37..7602c5593ef5 100644 --- a/tools/tracing/rtla/src/common.h +++ b/tools/tracing/rtla/src/common.h @@ -54,7 +54,7 @@ struct osnoise_context { }; extern struct trace_instance *trace_inst; -extern int stop_tracing; +extern volatile int stop_tracing; struct hist_params { char no_irq; @@ -152,7 +152,15 @@ void osnoise_destroy_tool(struct osnoise_tool *top); struct osnoise_tool *osnoise_init_tool(char *tool_name); struct osnoise_tool *osnoise_init_trace_tool(const char *tracer); bool osnoise_trace_is_off(struct osnoise_tool *tool, struct osnoise_tool *record); +int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us); +int osnoise_set_stop_total_us(struct osnoise_context *context, + long long stop_total_us); +int common_parse_options(int argc, char **argv, struct common_params *common); int common_apply_config(struct osnoise_tool *tool, struct common_params *params); int top_main_loop(struct osnoise_tool *tool); int hist_main_loop(struct osnoise_tool *tool); +int osn_set_stop(struct osnoise_tool *tool); + +void common_usage(const char *tool, const char *mode, + const char *desc, const char * const *start_msgs, const char * const *opt_msgs); diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c index 312c511fa004..945eb61efc46 100644 --- a/tools/tracing/rtla/src/osnoise.c +++ b/tools/tracing/rtla/src/osnoise.c @@ -1128,18 +1128,6 @@ osnoise_apply_config(struct osnoise_tool *tool, struct osnoise_params *params) goto out_err; } - retval = osnoise_set_stop_us(tool->context, params->common.stop_us); - if (retval) { - err_msg("Failed to set stop us\n"); - goto out_err; - } - - retval = osnoise_set_stop_total_us(tool->context, params->common.stop_total_us); - if (retval) { - err_msg("Failed to set stop total us\n"); - goto out_err; - } - retval = osnoise_set_tracing_thresh(tool->context, params->threshold); if (retval) { err_msg("Failed to set tracing_thresh\n"); @@ -1184,9 +1172,12 @@ int osnoise_enable(struct osnoise_tool *tool) debug_msg("Error cleaning up the buffer"); return retval; } - } + retval = osn_set_stop(tool); + if (retval) + return retval; + return 0; } diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h index 895687030c0b..168669aa7e0d 100644 --- a/tools/tracing/rtla/src/osnoise.h +++ b/tools/tracing/rtla/src/osnoise.h @@ -34,12 +34,7 @@ int osnoise_set_runtime_period(struct osnoise_context *context, unsigned long long period); void osnoise_restore_runtime_period(struct osnoise_context *context); -int osnoise_set_stop_us(struct osnoise_context *context, - long long stop_us); void osnoise_restore_stop_us(struct osnoise_context *context); - -int osnoise_set_stop_total_us(struct osnoise_context *context, - long long stop_total_us); void osnoise_restore_stop_total_us(struct osnoise_context *context); int osnoise_set_timerlat_period_us(struct osnoise_context *context, @@ -58,8 +53,6 @@ int osnoise_set_irq_disable(struct osnoise_context *context, bool onoff); void osnoise_report_missed_events(struct osnoise_tool *tool); int osnoise_apply_config(struct osnoise_tool *tool, struct osnoise_params *params); -int osnoise_hist_main(int argc, char *argv[]); -int osnoise_top_main(int argc, char **argv); int osnoise_enable(struct osnoise_tool *tool); int osnoise_main(int argc, char **argv); int hwnoise_main(int argc, char **argv); @@ -68,4 +61,3 @@ extern struct tool_ops timerlat_top_ops, timerlat_hist_ops; extern struct tool_ops osnoise_top_ops, osnoise_hist_ops; int run_tool(struct tool_ops *ops, int argc, char *argv[]); -int hist_main_loop(struct osnoise_tool *tool); diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c index ff8c231e47c4..9d70ea34807f 100644 --- a/tools/tracing/rtla/src/osnoise_hist.c +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -9,7 +9,6 @@ #include <string.h> #include <signal.h> #include <unistd.h> -#include <errno.h> #include <stdio.h> #include <time.h> @@ -409,16 +408,15 @@ osnoise_print_stats(struct osnoise_tool *tool) */ static void osnoise_hist_usage(void) { - int i; - - static const char * const msg[] = { - "", - " usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", + static const char * const msg_start[] = { + "[-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", " [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [-C [cgroup_name]] [--warm-up]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char * const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", " -p/--period us: osnoise period in us", " -r/--runtime us: osnoise runtime in us", @@ -453,13 +451,8 @@ static void osnoise_hist_usage(void) NULL, }; - fprintf(stderr, "rtla osnoise hist: a per-cpu histogram of the OS noise (version %s)\n", - VERSION); - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage("osnoise", "hist", "a per-cpu histogram of the OS noise", + msg_start, msg_opts); } /* @@ -469,7 +462,6 @@ static struct common_params *osnoise_hist_parse_args(int argc, char *argv[]) { struct osnoise_params *params; - struct trace_events *tevent; int retval; int c; char *trace_output = NULL; @@ -491,19 +483,12 @@ static struct common_params {"auto", required_argument, 0, 'a'}, {"bucket-size", required_argument, 0, 'b'}, {"entries", required_argument, 0, 'E'}, - {"cpus", required_argument, 0, 'c'}, - {"cgroup", optional_argument, 0, 'C'}, - {"debug", no_argument, 0, 'D'}, - {"duration", required_argument, 0, 'd'}, - {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"runtime", required_argument, 0, 'r'}, {"stop", required_argument, 0, 's'}, {"stop-total", required_argument, 0, 'S'}, {"trace", optional_argument, 0, 't'}, - {"event", required_argument, 0, 'e'}, {"threshold", required_argument, 0, 'T'}, {"no-header", no_argument, 0, '0'}, {"no-summary", no_argument, 0, '1'}, @@ -518,7 +503,10 @@ static struct common_params {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:", + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + + c = getopt_long(argc, argv, "a:b:E:hp:r:s:S:t::T:01234:5:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -544,34 +532,6 @@ static struct common_params params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; - break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = parse_optional_arg(argc, argv); - break; - case 'D': - config_debug = 1; - break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -D duration"); - break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - - params->common.events = tevent; - break; case 'E': params->common.hist.entries = get_llong_from_str(optarg); if (params->common.hist.entries < 10 || @@ -582,23 +542,11 @@ static struct common_params case '?': osnoise_hist_usage(); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'p': params->period = get_llong_from_str(optarg); if (params->period > 10000000) fatal("Period longer than 10 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 'r': params->runtime = get_llong_from_str(optarg); if (params->runtime < 100) diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c index 04c699bdd736..d54d47947fb4 100644 --- a/tools/tracing/rtla/src/osnoise_top.c +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -257,14 +257,16 @@ osnoise_print_stats(struct osnoise_tool *top) */ static void osnoise_top_usage(struct osnoise_params *params) { - int i; + const char *tool, *mode, *desc; - static const char * const msg[] = { - " [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", + static const char * const msg_start[] = { + "[-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\", " [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\", " [-c cpu-list] [-H cpu-list] [-P priority] [-C [cgroup_name]] [--warm-up s]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char * const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit", " -p/--period us: osnoise period in us", " -r/--runtime us: osnoise runtime in us", @@ -295,25 +297,16 @@ static void osnoise_top_usage(struct osnoise_params *params) }; if (params->mode == MODE_OSNOISE) { - fprintf(stderr, - "rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n", - VERSION); - - fprintf(stderr, " usage: rtla osnoise [top]"); + tool = "osnoise"; + mode = "top"; + desc = "a per-cpu summary of the OS noise"; + } else { + tool = "hwnoise"; + mode = ""; + desc = "a summary of hardware-related noise"; } - if (params->mode == MODE_HWNOISE) { - fprintf(stderr, - "rtla hwnoise: a summary of hardware-related noise (version %s)\n", - VERSION); - - fprintf(stderr, " usage: rtla hwnoise"); - } - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage(tool, mode, desc, msg_start, msg_opts); } /* @@ -322,7 +315,6 @@ static void osnoise_top_usage(struct osnoise_params *params) struct common_params *osnoise_top_parse_args(int argc, char **argv) { struct osnoise_params *params; - struct trace_events *tevent; int retval; int c; char *trace_output = NULL; @@ -346,15 +338,8 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cpus", required_argument, 0, 'c'}, - {"cgroup", optional_argument, 0, 'C'}, - {"debug", no_argument, 0, 'D'}, - {"duration", required_argument, 0, 'd'}, - {"event", required_argument, 0, 'e'}, - {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"quiet", no_argument, 0, 'q'}, {"runtime", required_argument, 0, 'r'}, {"stop", required_argument, 0, 's'}, @@ -370,7 +355,10 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:", + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + + c = getopt_long(argc, argv, "a:hp:qr:s:S:t::T:0:1:2:3:", long_options, NULL); /* Detect the end of the options. */ @@ -390,55 +378,15 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv) trace_output = "osnoise_trace.txt"; break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; - break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = parse_optional_arg(argc, argv); - break; - case 'D': - config_debug = 1; - break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -d duration"); - break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - params->common.events = tevent; - - break; case 'h': case '?': osnoise_top_usage(params); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'p': params->period = get_llong_from_str(optarg); if (params->period > 10000000) fatal("Period longer than 10 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 'q': params->common.quiet = 1; break; diff --git a/tools/tracing/rtla/src/timerlat.bpf.c b/tools/tracing/rtla/src/timerlat.bpf.c index e2265b5d6491..549d2d2191d2 100644 --- a/tools/tracing/rtla/src/timerlat.bpf.c +++ b/tools/tracing/rtla/src/timerlat.bpf.c @@ -40,6 +40,17 @@ struct { __uint(max_entries, 1); } signal_stop_tracing SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(unsigned int)); + __uint(max_entries, 1); + __array(values, unsigned int (void *)); +} bpf_action SEC(".maps") = { + .values = { + [0] = 0 + }, +}; + /* Params to be set by rtla */ const volatile int bucket_size = 1; const volatile int output_divisor = 1000; @@ -109,7 +120,7 @@ nosubprog void update_summary(void *map, map_set(map, SUMMARY_SUM, map_get(map, SUMMARY_SUM) + latency); } -nosubprog void set_stop_tracing(void) +nosubprog void set_stop_tracing(struct trace_event_raw_timerlat_sample *tp_args) { int value = 0; @@ -118,6 +129,12 @@ nosubprog void set_stop_tracing(void) /* Signal to userspace */ bpf_ringbuf_output(&signal_stop_tracing, &value, sizeof(value), 0); + + /* + * Call into BPF action program, if attached. + * Otherwise, just silently fail. + */ + bpf_tail_call(tp_args, &bpf_action, 0); } SEC("tp/osnoise/timerlat_sample") @@ -138,19 +155,19 @@ int handle_timerlat_sample(struct trace_event_raw_timerlat_sample *tp_args) update_summary(&summary_irq, latency, bucket); if (irq_threshold != 0 && latency_us >= irq_threshold) - set_stop_tracing(); + set_stop_tracing(tp_args); } else if (tp_args->context == 1) { update_main_hist(&hist_thread, bucket); update_summary(&summary_thread, latency, bucket); if (thread_threshold != 0 && latency_us >= thread_threshold) - set_stop_tracing(); + set_stop_tracing(tp_args); } else { update_main_hist(&hist_user, bucket); update_summary(&summary_user, latency, bucket); if (thread_threshold != 0 && latency_us >= thread_threshold) - set_stop_tracing(); + set_stop_tracing(tp_args); } return 0; diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c index df4f9bfe3433..8f8811f7a13b 100644 --- a/tools/tracing/rtla/src/timerlat.c +++ b/tools/tracing/rtla/src/timerlat.c @@ -9,7 +9,6 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> -#include <errno.h> #include <fcntl.h> #include <stdio.h> #include <sched.h> @@ -48,25 +47,17 @@ timerlat_apply_config(struct osnoise_tool *tool, struct timerlat_params *params) } } - if (params->mode != TRACING_MODE_BPF) { - /* - * In tracefs and mixed mode, timerlat tracer handles stopping - * on threshold - */ - retval = osnoise_set_stop_us(tool->context, params->common.stop_us); - if (retval) { - err_msg("Failed to set stop us\n"); + /* Check if BPF action program is requested but BPF is not available */ + if (params->bpf_action_program) { + if (params->mode == TRACING_MODE_TRACEFS) { + err_msg("BPF actions are not supported in tracefs-only mode\n"); goto out_err; } - retval = osnoise_set_stop_total_us(tool->context, params->common.stop_total_us); - if (retval) { - err_msg("Failed to set stop total us\n"); + if (timerlat_load_bpf_action_program(params->bpf_action_program)) goto out_err; - } } - retval = osnoise_set_timerlat_period_us(tool->context, params->timerlat_period_us ? params->timerlat_period_us : @@ -184,6 +175,16 @@ int timerlat_enable(struct osnoise_tool *tool) } } + /* + * In tracefs and mixed mode, timerlat tracer handles stopping + * on threshold + */ + if (params->mode != TRACING_MODE_BPF) { + retval = osn_set_stop(tool); + if (retval) + return retval; + } + return 0; } diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h index fd6065f48bb7..8dd5d134ce08 100644 --- a/tools/tracing/rtla/src/timerlat.h +++ b/tools/tracing/rtla/src/timerlat.h @@ -27,6 +27,7 @@ struct timerlat_params { int dump_tasks; int deepest_idle_state; enum timerlat_tracing_mode mode; + const char *bpf_action_program; }; #define to_timerlat_params(ptr) container_of(ptr, struct timerlat_params, common) @@ -36,4 +37,3 @@ int timerlat_main(int argc, char *argv[]); int timerlat_enable(struct osnoise_tool *tool); void timerlat_analyze(struct osnoise_tool *tool, bool stopped); void timerlat_free(struct osnoise_tool *tool); - diff --git a/tools/tracing/rtla/src/timerlat_bpf.c b/tools/tracing/rtla/src/timerlat_bpf.c index e97d16646bcd..05adf18303df 100644 --- a/tools/tracing/rtla/src/timerlat_bpf.c +++ b/tools/tracing/rtla/src/timerlat_bpf.c @@ -7,6 +7,10 @@ static struct timerlat_bpf *bpf; +/* BPF object and program for action program */ +static struct bpf_object *obj; +static struct bpf_program *prog; + /* * timerlat_bpf_init - load and initialize BPF program to collect timerlat data */ @@ -60,6 +64,19 @@ int timerlat_bpf_init(struct timerlat_params *params) } /* + * timerlat_bpf_set_action - set action on threshold executed on BPF side + */ +static int timerlat_bpf_set_action(struct bpf_program *prog) +{ + unsigned int key = 0, value = bpf_program__fd(prog); + + return bpf_map__update_elem(bpf->maps.bpf_action, + &key, sizeof(key), + &value, sizeof(value), + BPF_ANY); +} + +/* * timerlat_bpf_attach - attach BPF program to collect timerlat data */ int timerlat_bpf_attach(void) @@ -83,6 +100,11 @@ void timerlat_bpf_detach(void) void timerlat_bpf_destroy(void) { timerlat_bpf__destroy(bpf); + bpf = NULL; + if (obj) + bpf_object__close(obj); + obj = NULL; + prog = NULL; } static int handle_rb_event(void *ctx, void *data, size_t data_sz) @@ -177,4 +199,48 @@ int timerlat_bpf_get_summary_value(enum summary_field key, bpf->maps.summary_user, key, value_irq, value_thread, value_user, cpus); } + +/* + * timerlat_load_bpf_action_program - load and register a BPF action program + */ +int timerlat_load_bpf_action_program(const char *program_path) +{ + int err; + + obj = bpf_object__open_file(program_path, NULL); + if (!obj) { + err_msg("Failed to open BPF action program: %s\n", program_path); + goto out_err; + } + + err = bpf_object__load(obj); + if (err) { + err_msg("Failed to load BPF action program: %s\n", program_path); + goto out_obj_err; + } + + prog = bpf_object__find_program_by_name(obj, "action_handler"); + if (!prog) { + err_msg("BPF action program must have 'action_handler' function: %s\n", + program_path); + goto out_obj_err; + } + + err = timerlat_bpf_set_action(prog); + if (err) { + err_msg("Failed to register BPF action program: %s\n", program_path); + goto out_prog_err; + } + + return 0; + +out_prog_err: + prog = NULL; +out_obj_err: + bpf_object__close(obj); + obj = NULL; +out_err: + return 1; +} + #endif /* HAVE_BPF_SKEL */ diff --git a/tools/tracing/rtla/src/timerlat_bpf.h b/tools/tracing/rtla/src/timerlat_bpf.h index 118487436d30..169abeaf4363 100644 --- a/tools/tracing/rtla/src/timerlat_bpf.h +++ b/tools/tracing/rtla/src/timerlat_bpf.h @@ -12,6 +12,7 @@ enum summary_field { }; #ifndef __bpf__ +#include <bpf/libbpf.h> #ifdef HAVE_BPF_SKEL int timerlat_bpf_init(struct timerlat_params *params); int timerlat_bpf_attach(void); @@ -29,7 +30,7 @@ int timerlat_bpf_get_summary_value(enum summary_field key, long long *value_thread, long long *value_user, int cpus); - +int timerlat_load_bpf_action_program(const char *program_path); static inline int have_libbpf_support(void) { return 1; } #else static inline int timerlat_bpf_init(struct timerlat_params *params) @@ -57,6 +58,10 @@ static inline int timerlat_bpf_get_summary_value(enum summary_field key, { return -1; } +static inline int timerlat_load_bpf_action_program(const char *program_path) +{ + return -1; +} static inline int have_libbpf_support(void) { return 0; } #endif /* HAVE_BPF_SKEL */ #endif /* __bpf__ */ diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c index 1fb471a787b7..4e8c38a61197 100644 --- a/tools/tracing/rtla/src/timerlat_hist.c +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -696,17 +696,16 @@ timerlat_print_stats(struct osnoise_tool *tool) */ static void timerlat_hist_usage(void) { - int i; - - char *msg[] = { - "", - " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", + static const char * const msg_start[] = { + "[-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\", " [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", " [--no-index] [--with-zeros] [--dma-latency us] [-C [cgroup_name]] [--no-aa] [--dump-task] [-u|-k]", " [--warm-up s] [--deepest-idle-state n]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char * const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", " -p/--period us: timerlat period in us", " -i/--irq us: stop trace if the irq latency is higher than the argument in us", @@ -747,16 +746,12 @@ static void timerlat_hist_usage(void) " --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency", " --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed", " --on-end <action>: define action to be executed at measurement end, multiple are allowed", + " --bpf-action <program>: load and execute BPF program when latency threshold is exceeded", NULL, }; - fprintf(stderr, "rtla timerlat hist: a per-cpu histogram of the timer latency (version %s)\n", - VERSION); - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage("timerlat", "hist", "a per-cpu histogram of the timer latency", + msg_start, msg_opts); } /* @@ -766,7 +761,6 @@ static struct common_params *timerlat_hist_parse_args(int argc, char *argv[]) { struct timerlat_params *params; - struct trace_events *tevent; int auto_thresh; int retval; int c; @@ -796,25 +790,18 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cpus", required_argument, 0, 'c'}, - {"cgroup", optional_argument, 0, 'C'}, {"bucket-size", required_argument, 0, 'b'}, - {"debug", no_argument, 0, 'D'}, {"entries", required_argument, 0, 'E'}, - {"duration", required_argument, 0, 'd'}, - {"house-keeping", required_argument, 0, 'H'}, {"help", no_argument, 0, 'h'}, {"irq", required_argument, 0, 'i'}, {"nano", no_argument, 0, 'n'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"stack", required_argument, 0, 's'}, {"thread", required_argument, 0, 'T'}, {"trace", optional_argument, 0, 't'}, {"user-threads", no_argument, 0, 'u'}, {"kernel-threads", no_argument, 0, 'k'}, {"user-load", no_argument, 0, 'U'}, - {"event", required_argument, 0, 'e'}, {"no-irq", no_argument, 0, '0'}, {"no-thread", no_argument, 0, '1'}, {"no-header", no_argument, 0, '2'}, @@ -831,10 +818,14 @@ static struct common_params {"deepest-idle-state", required_argument, 0, '\4'}, {"on-threshold", required_argument, 0, '\5'}, {"on-end", required_argument, 0, '\6'}, + {"bpf-action", required_argument, 0, '\7'}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:", + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + + c = getopt_long(argc, argv, "a:b:E:hi:knp:s:t::T:uU0123456:7:8:9\1\2:\3:", long_options, NULL); /* detect the end of the options. */ @@ -857,40 +848,12 @@ static struct common_params trace_output = "timerlat_trace.txt"; break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; - break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = parse_optional_arg(argc, argv); - break; case 'b': params->common.hist.bucket_size = get_llong_from_str(optarg); if (params->common.hist.bucket_size == 0 || params->common.hist.bucket_size >= 1000000) fatal("Bucket size needs to be > 0 and <= 1000000"); break; - case 'D': - config_debug = 1; - break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -D duration"); - break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - - params->common.events = tevent; - break; case 'E': params->common.hist.entries = get_llong_from_str(optarg); if (params->common.hist.entries < 10 || @@ -901,12 +864,6 @@ static struct common_params case '?': timerlat_hist_usage(); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'i': params->common.stop_us = get_llong_from_str(optarg); break; @@ -921,12 +878,6 @@ static struct common_params if (params->timerlat_period_us > 1000000) fatal("Period longer than 1 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 's': params->print_stack = get_llong_from_str(optarg); break; @@ -1012,6 +963,9 @@ static struct common_params if (retval) fatal("Invalid action %s", optarg); break; + case '\7': + params->bpf_action_program = optarg; + break; default: fatal("Invalid option"); } diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c index 29c2c1f717ed..284b74773c2b 100644 --- a/tools/tracing/rtla/src/timerlat_top.c +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -11,7 +11,6 @@ #include <unistd.h> #include <stdio.h> #include <time.h> -#include <errno.h> #include <sched.h> #include <pthread.h> @@ -476,15 +475,14 @@ timerlat_print_stats(struct osnoise_tool *top) */ static void timerlat_top_usage(void) { - int i; - - static const char *const msg[] = { - "", - " usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", + static const char *const msg_start[] = { + "[-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\", " [[-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\", " [-P priority] [--dma-latency us] [--aa-only us] [-C [cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]", - "", - " -h/--help: print this menu", + NULL, + }; + + static const char *const msg_opts[] = { " -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit", " --aa-only us: stop if <us> latency is hit, only printing the auto analysis (reduces CPU usage)", " -p/--period us: timerlat period in us", @@ -519,16 +517,12 @@ static void timerlat_top_usage(void) " --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency", " --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed", " --on-end: define action to be executed at measurement end, multiple are allowed", + " --bpf-action <program>: load and execute BPF program when latency threshold is exceeded", NULL, }; - fprintf(stderr, "rtla timerlat top: a per-cpu summary of the timer latency (version %s)\n", - VERSION); - - for (i = 0; msg[i]; i++) - fprintf(stderr, "%s\n", msg[i]); - - exit(EXIT_SUCCESS); + common_usage("timerlat", "top", "a per-cpu summary of the timer latency", + msg_start, msg_opts); } /* @@ -538,7 +532,6 @@ static struct common_params *timerlat_top_parse_args(int argc, char **argv) { struct timerlat_params *params; - struct trace_events *tevent; long long auto_thresh; int retval; int c; @@ -566,17 +559,10 @@ static struct common_params while (1) { static struct option long_options[] = { {"auto", required_argument, 0, 'a'}, - {"cpus", required_argument, 0, 'c'}, - {"cgroup", optional_argument, 0, 'C'}, - {"debug", no_argument, 0, 'D'}, - {"duration", required_argument, 0, 'd'}, - {"event", required_argument, 0, 'e'}, {"help", no_argument, 0, 'h'}, - {"house-keeping", required_argument, 0, 'H'}, {"irq", required_argument, 0, 'i'}, {"nano", no_argument, 0, 'n'}, {"period", required_argument, 0, 'p'}, - {"priority", required_argument, 0, 'P'}, {"quiet", no_argument, 0, 'q'}, {"stack", required_argument, 0, 's'}, {"thread", required_argument, 0, 'T'}, @@ -595,10 +581,14 @@ static struct common_params {"deepest-idle-state", required_argument, 0, '8'}, {"on-threshold", required_argument, 0, '9'}, {"on-end", required_argument, 0, '\1'}, + {"bpf-action", required_argument, 0, '\2'}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:", + if (common_parse_options(argc, argv, ¶ms->common)) + continue; + + c = getopt_long(argc, argv, "a:hi:knp:qs:t::T:uU0:1:2:345:6:7:", long_options, NULL); /* detect the end of the options. */ @@ -635,43 +625,10 @@ static struct common_params /* set aa_only to avoid parsing the trace */ params->common.aa_only = 1; break; - case 'c': - retval = parse_cpu_set(optarg, ¶ms->common.monitored_cpus); - if (retval) - fatal("Invalid -c cpu list"); - params->common.cpus = optarg; - break; - case 'C': - params->common.cgroup = 1; - params->common.cgroup_name = optarg; - break; - case 'D': - config_debug = 1; - break; - case 'd': - params->common.duration = parse_seconds_duration(optarg); - if (!params->common.duration) - fatal("Invalid -d duration"); - break; - case 'e': - tevent = trace_event_alloc(optarg); - if (!tevent) - fatal("Error alloc trace event"); - - if (params->common.events) - tevent->next = params->common.events; - params->common.events = tevent; - break; case 'h': case '?': timerlat_top_usage(); break; - case 'H': - params->common.hk_cpus = 1; - retval = parse_cpu_set(optarg, ¶ms->common.hk_cpu_set); - if (retval) - fatal("Error parsing house keeping CPUs"); - break; case 'i': params->common.stop_us = get_llong_from_str(optarg); break; @@ -686,12 +643,6 @@ static struct common_params if (params->timerlat_period_us > 1000000) fatal("Period longer than 1 s"); break; - case 'P': - retval = parse_prio(optarg, ¶ms->common.sched_param); - if (retval == -1) - fatal("Invalid -P priority"); - params->common.set_sched = 1; - break; case 'q': params->common.quiet = 1; break; @@ -762,6 +713,9 @@ static struct common_params if (retval) fatal("Invalid action %s", optarg); break; + case '\2': + params->bpf_action_program = optarg; + break; default: fatal("Invalid option"); } diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c index 69cbc48d53d3..b8be3e28680e 100644 --- a/tools/tracing/rtla/src/trace.c +++ b/tools/tracing/rtla/src/trace.c @@ -2,7 +2,6 @@ #define _GNU_SOURCE #include <sys/sendfile.h> #include <tracefs.h> -#include <signal.h> #include <stdlib.h> #include <unistd.h> #include <errno.h> diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c index 9cf5a0098e9a..0da3b2470c31 100644 --- a/tools/tracing/rtla/src/utils.c +++ b/tools/tracing/rtla/src/utils.c @@ -17,6 +17,7 @@ #include <fcntl.h> #include <sched.h> #include <stdio.h> +#include <limits.h> #include "utils.h" @@ -112,7 +113,7 @@ void get_duration(time_t start_time, char *output, int output_size) * Receives a cpu list, like 1-3,5 (cpus 1, 2, 3, 5), and then set * filling cpu_set_t argument. * - * Returns 1 on success, 0 otherwise. + * Returns 0 on success, 1 otherwise. */ int parse_cpu_set(char *cpu_list, cpu_set_t *set) { @@ -314,6 +315,7 @@ static int procfs_is_workload_pid(const char *comm_prefix, struct dirent *proc_e if (retval <= 0) return 0; + buffer[MAX_PATH-1] = '\0'; retval = strncmp(comm_prefix, buffer, strlen(comm_prefix)); if (retval) return 0; @@ -337,6 +339,7 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr) struct dirent *proc_entry; DIR *procfs; int retval; + int pid; if (strlen(comm_prefix) >= MAX_PATH) { err_msg("Command prefix is too long: %d < strlen(%s)\n", @@ -356,8 +359,12 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr) if (!retval) continue; + if (strtoi(proc_entry->d_name, &pid)) { + err_msg("'%s' is not a valid pid", proc_entry->d_name); + goto out_err; + } /* procfs_is_workload_pid confirmed it is a pid */ - retval = __set_sched_attr(atoi(proc_entry->d_name), attr); + retval = __set_sched_attr(pid, attr); if (retval) { err_msg("Error setting sched attributes for pid:%s\n", proc_entry->d_name); goto out_err; @@ -742,6 +749,7 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg) if (fd < 0) return 0; + memset(path, 0, sizeof(path)); retval = read(fd, path, MAX_PATH); close(fd); @@ -749,6 +757,7 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg) if (retval <= 0) return 0; + path[MAX_PATH-1] = '\0'; start = path; start = strstr(start, ":"); @@ -784,27 +793,27 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg) } /* - * set_comm_cgroup - Set cgroup to pid_t pid + * open_cgroup_procs - Open the cgroup.procs file for the given cgroup * - * If cgroup argument is not NULL, the threads will move to the given cgroup. - * Otherwise, the cgroup of the calling, i.e., rtla, thread will be used. + * If cgroup argument is not NULL, the cgroup.procs file for that cgroup + * will be opened. Otherwise, the cgroup of the calling, i.e., rtla, thread + * will be used. * * Supports cgroup v2. * - * Returns 1 on success, 0 otherwise. + * Returns the file descriptor on success, -1 otherwise. */ -int set_pid_cgroup(pid_t pid, const char *cgroup) +static int open_cgroup_procs(const char *cgroup) { char cgroup_path[MAX_PATH - strlen("/cgroup.procs")]; char cgroup_procs[MAX_PATH]; - char pid_str[24]; int retval; int cg_fd; retval = find_mount("cgroup2", cgroup_path, sizeof(cgroup_path)); if (!retval) { err_msg("Did not find cgroupv2 mount point\n"); - return 0; + return -1; } if (!cgroup) { @@ -812,7 +821,7 @@ int set_pid_cgroup(pid_t pid, const char *cgroup) sizeof(cgroup_path) - strlen(cgroup_path)); if (!retval) { err_msg("Did not find self cgroup\n"); - return 0; + return -1; } } else { snprintf(&cgroup_path[strlen(cgroup_path)], @@ -825,6 +834,29 @@ int set_pid_cgroup(pid_t pid, const char *cgroup) cg_fd = open(cgroup_procs, O_RDWR); if (cg_fd < 0) + return -1; + + return cg_fd; +} + +/* + * set_pid_cgroup - Set cgroup to pid_t pid + * + * If cgroup argument is not NULL, the threads will move to the given cgroup. + * Otherwise, the cgroup of the calling, i.e., rtla, thread will be used. + * + * Supports cgroup v2. + * + * Returns 1 on success, 0 otherwise. + */ +int set_pid_cgroup(pid_t pid, const char *cgroup) +{ + char pid_str[24]; + int retval; + int cg_fd; + + cg_fd = open_cgroup_procs(cgroup); + if (cg_fd < 0) return 0; snprintf(pid_str, sizeof(pid_str), "%d\n", pid); @@ -853,8 +885,6 @@ int set_pid_cgroup(pid_t pid, const char *cgroup) */ int set_comm_cgroup(const char *comm_prefix, const char *cgroup) { - char cgroup_path[MAX_PATH - strlen("/cgroup.procs")]; - char cgroup_procs[MAX_PATH]; struct dirent *proc_entry; DIR *procfs; int retval; @@ -866,29 +896,7 @@ int set_comm_cgroup(const char *comm_prefix, const char *cgroup) return 0; } - retval = find_mount("cgroup2", cgroup_path, sizeof(cgroup_path)); - if (!retval) { - err_msg("Did not find cgroupv2 mount point\n"); - return 0; - } - - if (!cgroup) { - retval = get_self_cgroup(&cgroup_path[strlen(cgroup_path)], - sizeof(cgroup_path) - strlen(cgroup_path)); - if (!retval) { - err_msg("Did not find self cgroup\n"); - return 0; - } - } else { - snprintf(&cgroup_path[strlen(cgroup_path)], - sizeof(cgroup_path) - strlen(cgroup_path), "%s/", cgroup); - } - - snprintf(cgroup_procs, MAX_PATH, "%s/cgroup.procs", cgroup_path); - - debug_msg("Using cgroup path at: %s\n", cgroup_procs); - - cg_fd = open(cgroup_procs, O_RDWR); + cg_fd = open_cgroup_procs(cgroup); if (cg_fd < 0) return 0; @@ -1000,3 +1008,25 @@ char *parse_optional_arg(int argc, char **argv) return NULL; } } + +/* + * strtoi - convert string to integer with error checking + * + * Returns 0 on success, -1 if conversion fails or result is out of int range. + */ +int strtoi(const char *s, int *res) +{ + char *end_ptr; + long lres; + + if (!*s) + return -1; + + errno = 0; + lres = strtol(s, &end_ptr, 0); + if (errno || *end_ptr || lres > INT_MAX || lres < INT_MIN) + return -1; + + *res = (int) lres; + return 0; +} diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h index 091df4ba4587..f7c2a52a0ab5 100644 --- a/tools/tracing/rtla/src/utils.h +++ b/tools/tracing/rtla/src/utils.h @@ -3,6 +3,8 @@ #include <stdint.h> #include <time.h> #include <sched.h> +#include <stdbool.h> +#include <stdlib.h> /* * '18446744073709551615\0' @@ -24,7 +26,6 @@ void fatal(const char *fmt, ...); long parse_seconds_duration(char *val); void get_duration(time_t start_time, char *output, int output_size); -int parse_cpu_list(char *cpu_list, char **monitored_cpus); char *parse_optional_arg(int argc, char **argv); long long get_llong_from_str(char *start); @@ -82,12 +83,13 @@ static inline int set_deepest_cpu_idle_state(unsigned int cpu, unsigned int stat static inline int have_libcpupower_support(void) { return 0; } #endif /* HAVE_LIBCPUPOWER_SUPPORT */ int auto_house_keeping(cpu_set_t *monitored_cpus); +__attribute__((__warn_unused_result__)) int strtoi(const char *s, int *res); #define ns_to_usf(x) (((double)x/1000)) #define ns_to_per(total, part) ((part * 100) / (double)total) enum result { - PASSED = 0, /* same as EXIT_SUCCESS */ - ERROR = 1, /* same as EXIT_FAILURE, an error in arguments */ - FAILED = 2, /* test hit the stop tracing condition */ + PASSED = EXIT_SUCCESS, + ERROR = EXIT_FAILURE, + FAILED, /* test hit the stop tracing condition */ }; diff --git a/tools/tracing/rtla/tests/bpf/bpf_action_map.c b/tools/tracing/rtla/tests/bpf/bpf_action_map.c new file mode 100644 index 000000000000..1686e0b858e6 --- /dev/null +++ b/tools/tracing/rtla/tests/bpf/bpf_action_map.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_tracing.h> + +char LICENSE[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, unsigned int); + __type(value, unsigned long long); +} rtla_test_map SEC(".maps"); + +struct trace_event_raw_timerlat_sample; + +SEC("tp/timerlat_action") +int action_handler(struct trace_event_raw_timerlat_sample *tp_args) +{ + unsigned int key = 0; + unsigned long long value = 42; + + bpf_map_update_elem(&rtla_test_map, &key, &value, BPF_ANY); + + return 0; +} diff --git a/tools/tracing/rtla/tests/engine.sh b/tools/tracing/rtla/tests/engine.sh index c7de3d6ed6a8..ed261e07c6d9 100644 --- a/tools/tracing/rtla/tests/engine.sh +++ b/tools/tracing/rtla/tests/engine.sh @@ -105,7 +105,6 @@ check_with_osnoise_options() { [ "$1" == "" ] && continue option=$(echo $1 | cut -d '=' -f 1) value=$(echo $1 | cut -d '=' -f 2) - echo "option: $option, value: $value" echo "$value" > "/sys/kernel/tracing/osnoise/$option" || return 1 done fi diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t index bbaa1897d8a8..fd4935fd7b49 100644 --- a/tools/tracing/rtla/tests/timerlat.t +++ b/tools/tracing/rtla/tests/timerlat.t @@ -67,6 +67,21 @@ check "hist with trace output at end" \ "timerlat hist -d 1s --on-end trace" 0 "^ Saving trace to timerlat_trace.txt$" check "top with trace output at end" \ "timerlat top -d 1s --on-end trace" 0 "^ Saving trace to timerlat_trace.txt$" + +# BPF action program tests +if [ "$option" -eq 0 ] +then + # Test BPF action program properly in BPF mode + [ -z "$BPFTOOL" ] && BPFTOOL=bpftool + check "hist with BPF action program (BPF mode)" \ + "timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \ + 2 '"value": 42' +else + # Test BPF action program failure in non-BPF mode + check "hist with BPF action program (non-BPF mode)" \ + "timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o" \ + 1 "BPF actions are not supported in tracefs-only mode" +fi done test_end diff --git a/tools/verification/rvgen/rvgen/automata.py b/tools/verification/rvgen/rvgen/automata.py index d9a3fe2b74bf..3f06aef8d4fd 100644 --- a/tools/verification/rvgen/rvgen/automata.py +++ b/tools/verification/rvgen/rvgen/automata.py @@ -28,7 +28,7 @@ class Automata: self.function = self.__create_matrix() self.events_start, self.events_start_run = self.__store_init_events() - def __get_model_name(self): + def __get_model_name(self) -> str: basename = ntpath.basename(self.__dot_path) if not basename.endswith(".dot") and not basename.endswith(".gv"): print("not a dot file") @@ -40,7 +40,7 @@ class Automata: return model_name - def __open_dot(self): + def __open_dot(self) -> list[str]: cursor = 0 dot_lines = [] try: @@ -60,13 +60,13 @@ class Automata: cursor += 1 return dot_lines - def __get_cursor_begin_states(self): + def __get_cursor_begin_states(self) -> int: cursor = 0 while self.__dot_lines[cursor].split()[0] != "{node": cursor += 1 return cursor - def __get_cursor_begin_events(self): + def __get_cursor_begin_events(self) -> int: cursor = 0 while self.__dot_lines[cursor].split()[0] != "{node": cursor += 1 @@ -76,7 +76,7 @@ class Automata: cursor += 1 return cursor - def __get_state_variables(self): + def __get_state_variables(self) -> tuple[list[str], str, list[str]]: # wait for node declaration states = [] final_states = [] @@ -116,7 +116,7 @@ class Automata: return states, initial_state, final_states - def __get_event_variables(self): + def __get_event_variables(self) -> list[str]: # here we are at the begin of transitions, take a note, we will return later. cursor = self.__get_cursor_begin_events() @@ -140,7 +140,7 @@ class Automata: return sorted(set(events)) - def __create_matrix(self): + def __create_matrix(self) -> list[list[str]]: # transform the array into a dictionary events = self.events states = self.states @@ -174,7 +174,7 @@ class Automata: return matrix - def __store_init_events(self): + def __store_init_events(self) -> tuple[list[bool], list[bool]]: events_start = [False] * len(self.events) events_start_run = [False] * len(self.events) for i, _ in enumerate(self.events): @@ -196,10 +196,10 @@ class Automata: events_start_run[i] = True return events_start, events_start_run - def is_start_event(self, event): + def is_start_event(self, event: str) -> bool: return self.events_start[self.events.index(event)] - def is_start_run_event(self, event): + def is_start_run_event(self, event: str) -> bool: # prefer handle_start_event if there if any(self.events_start): return False diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py index b9b6f14cc536..06a26bf15a7e 100644 --- a/tools/verification/rvgen/rvgen/dot2c.py +++ b/tools/verification/rvgen/rvgen/dot2c.py @@ -26,64 +26,42 @@ class Dot2c(Automata): super().__init__(file_path, model_name) self.line_length = 100 - def __buff_to_string(self, buff): - string = "" - - for line in buff: - string = string + line + "\n" - - # cut off the last \n - return string[:-1] - - def __get_enum_states_content(self): + def __get_enum_states_content(self) -> list[str]: buff = [] - buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix)) + buff.append("\t%s%s," % (self.initial_state, self.enum_suffix)) for state in self.states: if state != self.initial_state: buff.append("\t%s%s," % (state, self.enum_suffix)) - buff.append("\tstate_max%s" % (self.enum_suffix)) + buff.append("\tstate_max%s," % (self.enum_suffix)) return buff - def get_enum_states_string(self): - buff = self.__get_enum_states_content() - return self.__buff_to_string(buff) - - def format_states_enum(self): + def format_states_enum(self) -> list[str]: buff = [] buff.append("enum %s {" % self.enum_states_def) - buff.append(self.get_enum_states_string()) + buff += self.__get_enum_states_content() buff.append("};\n") return buff - def __get_enum_events_content(self): + def __get_enum_events_content(self) -> list[str]: buff = [] - first = True for event in self.events: - if first: - buff.append("\t%s%s = 0," % (event, self.enum_suffix)) - first = False - else: - buff.append("\t%s%s," % (event, self.enum_suffix)) + buff.append("\t%s%s," % (event, self.enum_suffix)) - buff.append("\tevent_max%s" % self.enum_suffix) + buff.append("\tevent_max%s," % self.enum_suffix) return buff - def get_enum_events_string(self): - buff = self.__get_enum_events_content() - return self.__buff_to_string(buff) - - def format_events_enum(self): + def format_events_enum(self) -> list[str]: buff = [] buff.append("enum %s {" % self.enum_events_def) - buff.append(self.get_enum_events_string()) + buff += self.__get_enum_events_content() buff.append("};\n") return buff - def get_minimun_type(self): + def get_minimun_type(self) -> str: min_type = "unsigned char" if self.states.__len__() > 255: @@ -97,7 +75,7 @@ class Dot2c(Automata): return min_type - def format_automaton_definition(self): + def format_automaton_definition(self) -> list[str]: min_type = self.get_minimun_type() buff = [] buff.append("struct %s {" % self.struct_automaton_def) @@ -109,50 +87,37 @@ class Dot2c(Automata): buff.append("};\n") return buff - def format_aut_init_header(self): + def format_aut_init_header(self) -> list[str]: buff = [] buff.append("static const struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def)) return buff - def __get_string_vector_per_line_content(self, buff): - first = True - string = "" - for entry in buff: - if first: - string = string + "\t\t\"" + entry - first = False; - else: - string = string + "\",\n\t\t\"" + entry - string = string + "\"" - - return string - - def get_aut_init_events_string(self): - return self.__get_string_vector_per_line_content(self.events) - - def get_aut_init_states_string(self): - return self.__get_string_vector_per_line_content(self.states) + def __get_string_vector_per_line_content(self, entries: list[str]) -> str: + buff = [] + for entry in entries: + buff.append(f"\t\t\"{entry}\",") + return "\n".join(buff) - def format_aut_init_events_string(self): + def format_aut_init_events_string(self) -> list[str]: buff = [] buff.append("\t.event_names = {") - buff.append(self.get_aut_init_events_string()) + buff.append(self.__get_string_vector_per_line_content(self.events)) buff.append("\t},") return buff - def format_aut_init_states_string(self): + def format_aut_init_states_string(self) -> list[str]: buff = [] buff.append("\t.state_names = {") - buff.append(self.get_aut_init_states_string()) + buff.append(self.__get_string_vector_per_line_content(self.states)) buff.append("\t},") return buff - def __get_max_strlen_of_states(self): + def __get_max_strlen_of_states(self) -> int: max_state_name = max(self.states, key = len).__len__() return max(max_state_name, self.invalid_state_str.__len__()) - def get_aut_init_function(self): + def get_aut_init_function(self) -> str: nr_states = self.states.__len__() nr_events = self.events.__len__() buff = [] @@ -175,12 +140,12 @@ class Dot2c(Automata): if y != nr_events-1: line += ",\n" if linetoolong else ", " else: - line += "\n\t\t}," if linetoolong else " }," + line += ",\n\t\t}," if linetoolong else " }," buff.append(line) - return self.__buff_to_string(buff) + return '\n'.join(buff) - def format_aut_init_function(self): + def format_aut_init_function(self) -> list[str]: buff = [] buff.append("\t.function = {") buff.append(self.get_aut_init_function()) @@ -188,54 +153,54 @@ class Dot2c(Automata): return buff - def get_aut_init_initial_state(self): + def get_aut_init_initial_state(self) -> str: return self.initial_state - def format_aut_init_initial_state(self): + def format_aut_init_initial_state(self) -> list[str]: buff = [] initial_state = self.get_aut_init_initial_state() buff.append("\t.initial_state = " + initial_state + self.enum_suffix + ",") return buff - def get_aut_init_final_states(self): + def get_aut_init_final_states(self) -> str: line = "" first = True for state in self.states: - if first == False: + if not first: line = line + ', ' else: first = False - if self.final_states.__contains__(state): + if state in self.final_states: line = line + '1' else: line = line + '0' return line - def format_aut_init_final_states(self): + def format_aut_init_final_states(self) -> list[str]: buff = [] buff.append("\t.final_states = { %s }," % self.get_aut_init_final_states()) return buff - def __get_automaton_initialization_footer_string(self): + def __get_automaton_initialization_footer_string(self) -> str: footer = "};\n" return footer - def format_aut_init_footer(self): + def format_aut_init_footer(self) -> list[str]: buff = [] buff.append(self.__get_automaton_initialization_footer_string()) return buff - def format_invalid_state(self): + def format_invalid_state(self) -> list[str]: buff = [] buff.append("#define %s state_max%s\n" % (self.invalid_state_str, self.enum_suffix)) return buff - def format_model(self): + def format_model(self) -> list[str]: buff = [] buff += self.format_states_enum() buff += self.format_invalid_state() @@ -253,4 +218,4 @@ class Dot2c(Automata): def print_model_classic(self): buff = self.format_model() - print(self.__buff_to_string(buff)) + print('\n'.join(buff)) diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py index ed0a3c901106..6128fe238430 100644 --- a/tools/verification/rvgen/rvgen/dot2k.py +++ b/tools/verification/rvgen/rvgen/dot2k.py @@ -21,10 +21,10 @@ class dot2k(Monitor, Dot2c): Dot2c.__init__(self, file_path, extra_params.get("model_name")) self.enum_suffix = "_%s" % self.name - def fill_monitor_type(self): + def fill_monitor_type(self) -> str: return self.monitor_type.upper() - def fill_tracepoint_handlers_skel(self): + def fill_tracepoint_handlers_skel(self) -> str: buff = [] for event in self.events: buff.append("static void handle_%s(void *data, /* XXX: fill header */)" % event) @@ -38,26 +38,26 @@ class dot2k(Monitor, Dot2c): handle = "handle_start_run_event" if self.monitor_type == "per_task": buff.append("\tstruct task_struct *p = /* XXX: how do I get p? */;"); - buff.append("\tda_%s_%s(p, %s%s);" % (handle, self.name, event, self.enum_suffix)); + buff.append("\tda_%s(p, %s%s);" % (handle, event, self.enum_suffix)); else: - buff.append("\tda_%s_%s(%s%s);" % (handle, self.name, event, self.enum_suffix)); + buff.append("\tda_%s(%s%s);" % (handle, event, self.enum_suffix)); buff.append("}") buff.append("") return '\n'.join(buff) - def fill_tracepoint_attach_probe(self): + def fill_tracepoint_attach_probe(self) -> str: buff = [] for event in self.events: buff.append("\trv_attach_trace_probe(\"%s\", /* XXX: tracepoint */, handle_%s);" % (self.name, event)) return '\n'.join(buff) - def fill_tracepoint_detach_helper(self): + def fill_tracepoint_detach_helper(self) -> str: buff = [] for event in self.events: buff.append("\trv_detach_trace_probe(\"%s\", /* XXX: tracepoint */, handle_%s);" % (self.name, event)) return '\n'.join(buff) - def fill_model_h_header(self): + def fill_model_h_header(self) -> list[str]: buff = [] buff.append("/* SPDX-License-Identifier: GPL-2.0 */") buff.append("/*") @@ -66,10 +66,12 @@ class dot2k(Monitor, Dot2c): buff.append(" * Documentation/trace/rv/deterministic_automata.rst") buff.append(" */") buff.append("") + buff.append("#define MONITOR_NAME %s" % (self.name)) + buff.append("") return buff - def fill_model_h(self): + def fill_model_h(self) -> str: # # Adjust the definition names # @@ -83,17 +85,17 @@ class dot2k(Monitor, Dot2c): return '\n'.join(buff) - def fill_monitor_class_type(self): + def fill_monitor_class_type(self) -> str: if self.monitor_type == "per_task": return "DA_MON_EVENTS_ID" return "DA_MON_EVENTS_IMPLICIT" - def fill_monitor_class(self): + def fill_monitor_class(self) -> str: if self.monitor_type == "per_task": return "da_monitor_id" return "da_monitor" - def fill_tracepoint_args_skel(self, tp_type): + def fill_tracepoint_args_skel(self, tp_type: str) -> str: buff = [] tp_args_event = [ ("char *", "state"), @@ -115,7 +117,7 @@ class dot2k(Monitor, Dot2c): buff.append(" TP_ARGS(%s)" % tp_args_c) return '\n'.join(buff) - def fill_main_c(self): + def fill_main_c(self) -> str: main_c = super().fill_main_c() min_type = self.get_minimun_type() diff --git a/tools/verification/rvgen/rvgen/templates/container/main.c b/tools/verification/rvgen/rvgen/templates/container/main.c index 7d9b2f95c7e9..5fc89b46f279 100644 --- a/tools/verification/rvgen/rvgen/templates/container/main.c +++ b/tools/verification/rvgen/rvgen/templates/container/main.c @@ -8,8 +8,6 @@ #include "%%MODEL_NAME%%.h" -struct rv_monitor rv_%%MODEL_NAME%%; - struct rv_monitor rv_%%MODEL_NAME%% = { .name = "%%MODEL_NAME%%", .description = "%%DESCRIPTION%%", diff --git a/tools/verification/rvgen/rvgen/templates/dot2k/main.c b/tools/verification/rvgen/rvgen/templates/dot2k/main.c index e0fd1134bd85..a14e4f0883db 100644 --- a/tools/verification/rvgen/rvgen/templates/dot2k/main.c +++ b/tools/verification/rvgen/rvgen/templates/dot2k/main.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "%%MODEL_NAME%%" @@ -20,15 +19,9 @@ * This is the self-generated part of the monitor. Generally, there is no need * to touch this section. */ +#define RV_MON_TYPE RV_MON_%%MONITOR_TYPE%% #include "%%MODEL_NAME%%.h" - -/* - * Declare the deterministic automata monitor. - * - * The rv monitor reference is needed for the monitor declaration. - */ -static struct rv_monitor rv_%%MODEL_NAME%%; -DECLARE_DA_MON_%%MONITOR_TYPE%%(%%MODEL_NAME%%, %%MIN_TYPE%%); +#include <rv/da_monitor.h> /* * This is the instrumentation part of the monitor. @@ -42,7 +35,7 @@ static int enable_%%MODEL_NAME%%(void) { int retval; - retval = da_monitor_init_%%MODEL_NAME%%(); + retval = da_monitor_init(); if (retval) return retval; @@ -53,33 +46,33 @@ static int enable_%%MODEL_NAME%%(void) static void disable_%%MODEL_NAME%%(void) { - rv_%%MODEL_NAME%%.enabled = 0; + rv_this.enabled = 0; %%TRACEPOINT_DETACH%% - da_monitor_destroy_%%MODEL_NAME%%(); + da_monitor_destroy(); } /* * This is the monitor register section. */ -static struct rv_monitor rv_%%MODEL_NAME%% = { +static struct rv_monitor rv_this = { .name = "%%MODEL_NAME%%", .description = "%%DESCRIPTION%%", .enable = enable_%%MODEL_NAME%%, .disable = disable_%%MODEL_NAME%%, - .reset = da_monitor_reset_all_%%MODEL_NAME%%, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_%%MODEL_NAME%%(void) { - return rv_register_monitor(&rv_%%MODEL_NAME%%, %%PARENT%%); + return rv_register_monitor(&rv_this, %%PARENT%%); } static void __exit unregister_%%MODEL_NAME%%(void) { - rv_unregister_monitor(&rv_%%MODEL_NAME%%); + rv_unregister_monitor(&rv_this); } module_init(register_%%MODEL_NAME%%); |
