From b5964b968ac64c2ec2debee7518499113b27c34e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:50 -0800 Subject: bpf: Add skb dynptrs Add skb dynptrs, which are dynptrs whose underlying pointer points to a skb. The dynptr acts on skb data. skb dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of skb->data and skb->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For bpf prog types that don't support writes on skb data, the dynptr is read-only (bpf_dynptr_write() will return an error) For reads and writes through the bpf_dynptr_read() and bpf_dynptr_write() interfaces, reading and writing from/to data in the head as well as from/to non-linear paged buffers is supported. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() (added in subsequent commit) should be used. For examples of how skb dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-8-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 62ce1f5d1b1d..d0351d30e551 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5325,11 +5325,17 @@ union bpf_attr { * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. - * *flags* is currently unused. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr or if *flags* is not 0. + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) * Description @@ -5337,6 +5343,9 @@ union bpf_attr { * * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. + * + * skb type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is * read-only, if the dynptr is invalid, or if the offset and length -- cgit v1.2.3 From 05421aecd4ed65da0dc17b0c3c13779ef334e9e5 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:51 -0800 Subject: bpf: Add xdp dynptrs Add xdp dynptrs, which are dynptrs whose underlying pointer points to a xdp_buff. The dynptr acts on xdp data. xdp dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of xdp->data and xdp->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For reads and writes on the dynptr, this includes reading/writing from/to and across fragments. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() should be used. For examples of how xdp dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-9-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d0351d30e551..faa304c926cf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5344,7 +5344,7 @@ union bpf_attr { * *len* must be a statically known value. The returned data slice * is invalidated whenever the dynptr is invalidated. * - * skb type dynptrs may not use bpf_dynptr_data. They should + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. * Return * Pointer to the underlying dynptr data, NULL if the dynptr is -- cgit v1.2.3 From 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:52 -0800 Subject: bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr. The user must pass in a buffer to store the contents of the data slice if a direct pointer to the data cannot be obtained. For skb and xdp type dynptrs, these two APIs are the only way to obtain a data slice. However, for other types of dynptrs, there is no difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data. For skb type dynptrs, the data is copied into the user provided buffer if any of the data is not in the linear portion of the skb. For xdp type dynptrs, the data is copied into the user provided buffer if the data is between xdp frags. If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then the skb will be uncloned (see bpf_unclone_prologue()). Please note that any bpf_dynptr_write() automatically invalidates any prior data slices of the skb dynptr. This is because the skb may be cloned or may need to pull its paged buffer into the head. As such, any bpf_dynptr_write() will automatically have its prior data slices invalidated, even if the write is to data in the skb head of an uncloned skb. Please note as well that any other helper calls that change the underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data slices of the skb dynptr as well, for the same reasons. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index faa304c926cf..c9699304aed2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5329,6 +5329,11 @@ union bpf_attr { * *flags* must be 0 except for skb-type dynptrs. * * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * * * For *flags*, please see the flags accepted by * **bpf_skb_store_bytes**\ (). * Return -- cgit v1.2.3 From f71f8530494bb5ab43d3369ef0ce8373eb1ee077 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 2 Mar 2023 13:46:13 +0200 Subject: bpf: Add support for absolute value BPF timers Add a new flag BPF_F_TIMER_ABS that can be passed to bpf_timer_start() to start an absolute value timer instead of the default relative value. This makes the timer expire at an exact point in time, instead of a time with latencies induced by both the BPF and timer subsystems. Suggested-by: Artem Bityutskiy Signed-off-by: Tero Kristo Link: https://lore.kernel.org/r/20230302114614.2985072-2-tero.kristo@linux.intel.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c9699304aed2..976b194eb775 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4969,6 +4969,12 @@ union bpf_attr { * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * * Return * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier @@ -7097,4 +7103,13 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From d509c55cda22096a1836e35b03f66e1ef411c0c2 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 1 Mar 2023 12:09:13 +0200 Subject: wifi: nl80211: Update the documentation of NL80211_SCAN_FLAG_COLOCATED_6GHZ Add a detailed description of NL80211_SCAN_FLAG_COLOCATED_6GHZ flag. Signed-off-by: Ilan Peer Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230301115906.487ab04feb39.I5129fd61841332474693046241586f057b134c3c@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f14621a954e1..c22eeb18b996 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6510,8 +6510,14 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. - * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by - * 2.4/5 GHz APs + * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for collocated APs reported by + * 2.4/5 GHz APs. When the flag is set, the scan logic will use the + * information from the RNR element found in beacons/probe responses + * received on the 2.4/5 GHz channels to actively scan only the 6GHz + * channels on which APs are expected to be found. Note that when not set, + * the scan logic would scan all 6GHz channels, but since transmission of + * probe requests on non PSC channels is limited, it is highly likely that + * these channels would passively be scanned. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, -- cgit v1.2.3 From cbbaf2bb829b6c4ef911d4a725fc9b1fadc1e43f Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 1 Mar 2023 12:09:21 +0200 Subject: wifi: nl80211: add a command to enable/disable HW timestamping Add a command to enable and disable HW timestamping of TM and FTM frames. HW timestamping can be enabled for a specific mac address or for all addresses. The low level driver will indicate how many peers HW timestamping can be enabled concurrently, and this information will be passed to userspace. Signed-off-by: Avraham Stern Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230301115906.05678d7b1c17.Iccc08869ea8156f1c71a3111a47f86dd56234bd0@changeid [switch to needing netdev UP, minor edits] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c22eeb18b996..c8520c150f9c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1299,6 +1299,16 @@ * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station * + * @NL80211_CMD_SET_HW_TIMESTAMP: Enable/disable HW timestamping of Timing + * measurement and Fine timing measurement frames. If %NL80211_ATTR_MAC + * is included, enable/disable HW timestamping only for frames to/from the + * specified MAC address. Otherwise enable/disable HW timestamping for + * all TM/FTM frames (including ones that were enabled with specific MAC + * address). If %NL80211_ATTR_HW_TIMESTAMP_ENABLED is not included, disable + * HW timestamping. + * The number of peers that HW timestamping can be enabled for concurrently + * is indicated by %NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1550,6 +1560,8 @@ enum nl80211_commands { NL80211_CMD_MODIFY_LINK_STA, NL80211_CMD_REMOVE_LINK_STA, + NL80211_CMD_SET_HW_TIMESTAMP, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2775,6 +2787,13 @@ enum nl80211_commands { * indicates that the sub-channel is punctured. Higher 16 bits are * reserved. * + * @NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS: Maximum number of peers that HW + * timestamping can be enabled for concurrently (u16), a wiphy attribute. + * A value of 0xffff indicates setting for all peers (i.e. not specifying + * an address with %NL80211_CMD_SET_HW_TIMESTAMP) is supported. + * @NL80211_ATTR_HW_TIMESTAMP_ENABLED: Indicates whether HW timestamping should + * be enabled or not (flag attribute). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3306,6 +3325,9 @@ enum nl80211_attrs { NL80211_ATTR_PUNCT_BITMAP, + NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS, + NL80211_ATTR_HW_TIMESTAMP_ENABLED, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 6933486133ecf71bbe273d7ac72cfc4a51286af3 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Thu, 12 Jan 2023 06:54:13 +0530 Subject: wifi: nl80211: Add support for randomizing TA of auth and deauth frames Add support to use a random local address in authentication and deauthentication frames sent to unassociated peer when the driver supports. The driver needs to configure receive behavior to accept frames with random transmit address specified in TX path authentication frames during the time of the frame exchange is pending and such frames need to be acknowledged similarly to frames sent to the local permanent address when this random address functionality is used. This capability allows use of randomized transmit address for PASN authentication frames to improve privacy of WLAN clients. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20230112012415.167556-2-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c8520c150f9c..9a0ac0363f1f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6348,6 +6348,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SECURE_NAN: Device supports NAN Pairing which enables * authentication, data encryption and message integrity. * + * @NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA: Device supports randomized TA + * in authentication and deauthentication frames sent to unassociated peer + * using @NL80211_CMD_FRAME. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6418,6 +6422,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE, NL80211_EXT_FEATURE_PUNCT, NL80211_EXT_FEATURE_SECURE_NAN, + NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 4386b921857793440ebd4db3d6b70639149c7074 Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Fri, 24 Feb 2023 10:52:51 +0100 Subject: netfilter: bridge: introduce broute meta statement nftables equivalent for ebtables -t broute. Implement broute meta statement to set br_netfilter_broute flag in skb to force a packet to be routed instead of being bridged. Signed-off-by: Sriram Yagnaraman Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ff677f3a6cad..9c6f02c26054 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -931,6 +931,7 @@ enum nft_exthdr_attributes { * @NFT_META_TIME_HOUR: hour of day (in seconds) * @NFT_META_SDIF: slave device interface index * @NFT_META_SDIFNAME: slave device interface name + * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit */ enum nft_meta_keys { NFT_META_LEN, @@ -969,6 +970,7 @@ enum nft_meta_keys { NFT_META_TIME_HOUR, NFT_META_SDIF, NFT_META_SDIFNAME, + NFT_META_BRI_BROUTE, __NFT_META_IIFTYPE, }; -- cgit v1.2.3 From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:17 -0800 Subject: bpf: implement numbers iterator Implement the first open-coded iterator type over a range of integers. It's public API consists of: - bpf_iter_num_new() constructor, which accepts [start, end) range (that is, start is inclusive, end is exclusive). - bpf_iter_num_next() which will keep returning read-only pointer to int until the range is exhausted, at which point NULL will be returned. If bpf_iter_num_next() is kept calling after this, NULL will be persistently returned. - bpf_iter_num_destroy() destructor, which needs to be called at some point to clean up iterator state. BPF verifier enforces that iterator destructor is called at some point before BPF program exits. Note that `start = end = X` is a valid combination to setup an empty iterator. bpf_iter_num_new() will return 0 (success) for any such combination. If bpf_iter_num_new() detects invalid combination of input arguments, it returns error, resets iterator state to, effectively, empty iterator, so any subsequent call to bpf_iter_num_next() will keep returning NULL. BPF verifier has no knowledge that returned integers are in the [start, end) value range, as both `start` and `end` are not statically known and enforced: they are runtime values. While the implementation is pretty trivial, some care needs to be taken to avoid overflows and underflows. Subsequent selftests will validate correctness of [start, end) semantics, especially around extremes (INT_MIN and INT_MAX). Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can be specified. bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded BPF loops and bpf_loop() helper and is the basis for implementing ergonomic BPF loops with no statically known or verified bounds. Subsequent patches implement bpf_for() macro, demonstrating how this can be wrapped into something that works and feels like a normal for() loop in C language. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 976b194eb775..4abddb668a10 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7112,4 +7112,12 @@ enum { BPF_F_TIMER_ABS = (1ULL << 0), }; +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 4821a076eb602a6238528e9ebafeac853c833415 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Mar 2023 16:23:26 -0500 Subject: sctp: add fair capacity stream scheduler As it says in rfc8260#section-3.5 about the fair capacity scheduler: A fair capacity distribution between the streams is used. This scheduler considers the lengths of the messages of each stream and schedules them in a specific way to maintain an equal capacity for all streams. The details are implementation dependent. interleaving user messages allows for a better realization of the fair capacity usage. This patch adds Fair Capacity Scheduler based on the foundations added by commit 5bbbbe32a431 ("sctp: introduce stream scheduler foundations"): A fc_list and a fc_length are added into struct sctp_stream_out_ext and a fc_list is added into struct sctp_stream. In .enqueue, when there are chunks enqueued into a stream, this stream will be linked into stream-> fc_list by its fc_list ordered by its fc_length. In .dequeue, it always picks up the 1st skb from stream->fc_list. In .dequeue_done, fc_length is increased by chunk's len and update its location in stream->fc_list according to the its new fc_length. Note that when the new fc_length overflows in .dequeue_done, instead of resetting all fc_lengths to 0, we only reduced them by U32_MAX / 4 to avoid a moment of imbalance in the scheduling, as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Paolo Abeni --- include/uapi/linux/sctp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index ed7d4ecbf53d..6814c5a1c4bc 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1211,7 +1211,8 @@ enum sctp_sched_type { SCTP_SS_DEFAULT = SCTP_SS_FCFS, SCTP_SS_PRIO, SCTP_SS_RR, - SCTP_SS_MAX = SCTP_SS_RR + SCTP_SS_FC, + SCTP_SS_MAX = SCTP_SS_FC }; /* Probe Interval socket option */ -- cgit v1.2.3 From 42d452e7709fdb4d42376d2a97369e22cc80a5d2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Mar 2023 16:23:27 -0500 Subject: sctp: add weighted fair queueing stream scheduler As it says in rfc8260#section-3.6 about the weighted fair queueing scheduler: A Weighted Fair Queueing scheduler between the streams is used. The weight is configurable per outgoing SCTP stream. This scheduler considers the lengths of the messages of each stream and schedules them in a specific way to use the capacity according to the given weights. If the weight of stream S1 is n times the weight of stream S2, the scheduler should assign to stream S1 n times the capacity it assigns to stream S2. The details are implementation dependent. Interleaving user messages allows for a better realization of the capacity usage according to the given weights. This patch adds Weighted Fair Queueing Scheduler actually based on the code of Fair Capacity Scheduler by adding fc_weight into struct sctp_stream_out_ext and taking it into account when sorting stream-> fc_list in sctp_sched_fc_sched() and sctp_sched_fc_dequeue_done(). Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Paolo Abeni --- include/uapi/linux/sctp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6814c5a1c4bc..b7d91d4cf0db 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1212,7 +1212,8 @@ enum sctp_sched_type { SCTP_SS_PRIO, SCTP_SS_RR, SCTP_SS_FC, - SCTP_SS_MAX = SCTP_SS_FC + SCTP_SS_WFQ, + SCTP_SS_MAX = SCTP_SS_WFQ }; /* Probe Interval socket option */ -- cgit v1.2.3 From 5a70f4a63000ba68004fb3c1aaf2f90303dd228f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Wei=C3=9F?= Date: Thu, 9 Mar 2023 14:38:23 +0100 Subject: bpf: Fix a typo for BPF_F_ANY_ALIGNMENT in bpf.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix s/BPF_PROF_LOAD/BPF_PROG_LOAD/ typo in the documentation comment for BPF_F_ANY_ALIGNMENT in bpf.h. Signed-off-by: Michael Weiß Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230309133823.944097-1-michael.weiss@aisec.fraunhofer.de --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4abddb668a10..d8c534e05b0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1108,7 +1108,7 @@ enum bpf_link_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) -/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will allow any alignment whatsoever. On platforms * with strict alignment requirements for loads ands stores (such * as sparc and mips) the verifier validates that all loads and -- cgit v1.2.3 From be50da3e9d4ad1958f7b11322d44d94d5c25a4c1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 9 Mar 2023 10:45:59 +0100 Subject: net: virtio_net: implement exact header length guest feature Virtio spec introduced a feature VIRTIO_NET_F_GUEST_HDRLEN which when set implicates that device benefits from knowing the exact size of the header. For compatibility, to signal to the device that the header is reliable driver also needs to set this feature. Without this feature set by driver, device has to figure out the header size itself. Quoting the original virtio spec: "hdr_len is a hint to the device as to how much of the header needs to be kept to copy into each packet" "a hint" might not be clear for the reader what does it mean, if it is "maybe like that" of "exactly like that". This feature just makes it crystal clear and let the device count on the hdr_len being filled up by the exact length of header. Also note the spec already has following note about hdr_len: "Due to various bugs in implementations, this field is not useful as a guarantee of the transport header size." Without this feature the device needs to parse the header in core data path handling. Accurate information helps the device to eliminate such header parsing and directly use the hardware accelerators for GSO operation. virtio_net_hdr_from_skb() fills up hdr_len to skb_headlen(skb). The driver already complies to fill the correct value. Introduce the feature and advertise it. Note that virtio spec also includes following note for device implementation: "Caution should be taken by the implementation so as to prevent a malicious driver from attacking the device by setting an incorrect hdr_len." There is a plan to support this feature in our emulated device. A device of SolidRun offers this feature bit. They claim this feature will save the device a few cycles for every GSO packet. Link: https://docs.oasis-open.org/virtio/virtio/v1.2/cs01/virtio-v1.2-cs01.html#x1-230006x3 Signed-off-by: Jiri Pirko Reviewed-by: Parav Pandit Reviewed-by: Alvaro Karsz Acked-by: Michael S. Tsirkin Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230309094559.917857-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/uapi/linux/virtio_net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index b4062bed186a..12c1c9699935 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -61,6 +61,7 @@ #define VIRTIO_NET_F_GUEST_USO6 55 /* Guest can handle USOv6 in. */ #define VIRTIO_NET_F_HOST_USO 56 /* Host can handle USO in. */ #define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ +#define VIRTIO_NET_F_GUEST_HDRLEN 59 /* Guest provides the exact hdr_len value. */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device -- cgit v1.2.3 From 27d7fdf06fdb84455ff585b58c8034e2fab42583 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 13 Mar 2023 14:56:27 -0600 Subject: bpf: use canonical ftrace path The canonical location for the tracefs filesystem is at /sys/kernel/tracing. But, from Documentation/trace/ftrace.rst: Before 4.1, all ftrace tracing control files were within the debugfs file system, which is typically located at /sys/kernel/debug/tracing. For backward compatibility, when mounting the debugfs file system, the tracefs file system will be automatically mounted at: /sys/kernel/debug/tracing Many comments and samples in the bpf code still refer to this older debugfs path, so let's update them to avoid confusion. There are a few spots where the bpf code explicitly checks both tracefs and debugfs (tools/bpf/bpftool/tracelog.c and tools/lib/api/fs/fs.c) and I've left those alone so that the tools can continue to work with both paths. Signed-off-by: Ross Zwisler Acked-by: Michael S. Tsirkin Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20230313205628.1058720-2-zwisler@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d8c534e05b0a..13129df937cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1647,17 +1647,17 @@ union bpf_attr { * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) - * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * to file *\/sys/kernel/tracing/trace* from TraceFS, if * available. It can take up to three additional **u64** * arguments (as an eBPF helpers, the total number of arguments is * limited to five). * * Each time the helper is called, it appends a line to the trace. - * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is - * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in - * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *\/sys/kernel/tracing/trace_options* (see also the * *README* file under the same directory). However, it usually * defaults to something like: * -- cgit v1.2.3 From a3a48de5eade770e911d35291217bdd69ce04ef1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Mar 2023 15:11:51 +0200 Subject: vxlan: mdb: Add MDB control path support Implement MDB control path support, enabling the creation, deletion, replacement and dumping of MDB entries in a similar fashion to the bridge driver. Unlike the bridge driver, each entry stores a list of remote VTEPs to which matched packets need to be replicated to and not a list of bridge ports. The motivating use case is the installation of MDB entries by a user space control plane in response to received EVPN routes. As such, only allow permanent MDB entries to be installed and do not implement snooping functionality, avoiding a lot of unnecessary complexity. Since entries can only be modified by user space under RTNL, use RTNL as the write lock. Use RCU to ensure that MDB entries and remotes are not freed while being accessed from the data path during transmission. In terms of uAPI, reuse the existing MDB netlink interface, but add a few new attributes to request and response messages: * IP address of the destination VXLAN tunnel endpoint where the multicast receivers reside. * UDP destination port number to use to connect to the remote VXLAN tunnel endpoint. * VXLAN VNI Network Identifier to use to connect to the remote VXLAN tunnel endpoint. Required when Ingress Replication (IR) is used and the remote VTEP is not a member of originating broadcast domain (VLAN/VNI) [1]. * Source VNI Network Identifier the MDB entry belongs to. Used only when the VXLAN device is in external mode. * Interface index of the outgoing interface to reach the remote VXLAN tunnel endpoint. This is required when the underlay destination IP is multicast (P2MP), as the multicast routing tables are not consulted. All the new attributes are added under the 'MDBA_SET_ENTRY_ATTRS' nest which is strictly validated by the bridge driver, thereby automatically rejecting the new attributes. [1] https://datatracker.ietf.org/doc/html/draft-ietf-bess-evpn-irb-mcast#section-3.2.2 Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d60c456710b3..c9d624f528c5 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -633,6 +633,11 @@ enum { MDBA_MDB_EATTR_GROUP_MODE, MDBA_MDB_EATTR_SOURCE, MDBA_MDB_EATTR_RTPROT, + MDBA_MDB_EATTR_DST, + MDBA_MDB_EATTR_DST_PORT, + MDBA_MDB_EATTR_VNI, + MDBA_MDB_EATTR_IFINDEX, + MDBA_MDB_EATTR_SRC_VNI, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) @@ -728,6 +733,11 @@ enum { MDBE_ATTR_SRC_LIST, MDBE_ATTR_GROUP_MODE, MDBE_ATTR_RTPROT, + MDBE_ATTR_DST, + MDBE_ATTR_DST_PORT, + MDBE_ATTR_VNI, + MDBE_ATTR_IFINDEX, + MDBE_ATTR_SRC_VNI, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) -- cgit v1.2.3 From 8e40c3b6e1538401d2cf8d43087ebe1db8026af9 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Wed, 8 Mar 2023 16:15:56 +0530 Subject: wifi: nl80211: Update the documentation of NL80211_SCAN_FLAG_COLOCATED_6GHZ Currently when NL80211_SCAN_FLAG_COLOCATED_6GHZ is set in the scan flags, in addition to the co-located APs, PSC channels in the 6 GHz band would also be scanned if the user space has asked for it. In other words, the scan would happen on PSC channels & co-located 6 GHz channels that were reported in the RNR IE. Update the documentation of NL80211_SCAN_FLAG_COLOCATED_6GHZ flag to reflect the above said behavior. Signed-off-by: Manikanta Pubbisetty Link: https://lore.kernel.org/r/20230308104556.9399-1-quic_mpubbise@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9a0ac0363f1f..14e958a32b84 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6544,7 +6544,9 @@ enum nl80211_timeout_reason { * channels on which APs are expected to be found. Note that when not set, * the scan logic would scan all 6GHz channels, but since transmission of * probe requests on non PSC channels is limited, it is highly likely that - * these channels would passively be scanned. + * these channels would passively be scanned. Also note that when the flag + * is set, in addition to the colocated APs, PSC channels would also be + * scanned if the user space has asked for it. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, -- cgit v1.2.3 From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:00 -0700 Subject: bpf: Create links for BPF struct_ops maps. Make bpf_link support struct_ops. Previously, struct_ops were always used alone without any associated links. Upon updating its value, a struct_ops would be activated automatically. Yet other BPF program types required to make a bpf_link with their instances before they could become active. Now, however, you can create an inactive struct_ops, and create a link to activate it later. With bpf_links, struct_ops has a behavior similar to other BPF program types. You can pin/unpin them from their links and the struct_ops will be deactivated when its link is removed while previously need someone to delete the value for it to be deactivated. bpf_links are responsible for registering their associated struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag set to create a bpf_link, while a structs without this flag behaves in the same manner as before and is registered upon updating its value. The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it used to craft the links for BPF struct_ops programs, but also to create links for BPF struct_ops them-self. Since the links of BPF struct_ops programs are only used to create trampolines internally, they are never seen in other contexts. Thus, they can be reused for struct_ops themself. To maintain a reference to the map supporting this link, we add bpf_struct_ops_link as an additional type. The pointer of the map is RCU and won't be necessary until later in the patchset. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 13129df937cd..42f40ee083bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1033,6 +1033,7 @@ enum bpf_attach_type { BPF_PERF_EVENT, BPF_TRACE_KPROBE_MULTI, BPF_LSM_CGROUP, + BPF_STRUCT_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -1266,6 +1267,9 @@ enum { /* Create a map that is suitable to be an inner map with dynamic max entries */ BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), }; /* Flags for BPF_PROG_QUERY. */ @@ -1507,7 +1511,10 @@ union bpf_attr { } task_fd_query; struct { /* struct used by BPF_LINK_CREATE command */ - __u32 prog_fd; /* eBPF program to attach */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; union { __u32 target_fd; /* object to attach to */ __u32 target_ifindex; /* target ifindex */ @@ -6379,6 +6386,9 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 map_id; + } struct_ops; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:02 -0700 Subject: bpf: Update the struct_ops of a bpf_link. By improving the BPF_LINK_UPDATE command of bpf(), it should allow you to conveniently switch between different struct_ops on a single bpf_link. This would enable smoother transitions from one struct_ops to another. The struct_ops maps passing along with BPF_LINK_UPDATE should have the BPF_F_LINK flag. Signed-off-by: Kui-Feng Lee Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42f40ee083bf..e3d3b5160d26 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1555,12 +1555,23 @@ union bpf_attr { struct { /* struct used by BPF_LINK_UPDATE command */ __u32 link_fd; /* link fd */ - /* new program fd to update link with */ - __u32 new_prog_fd; + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; __u32 flags; /* extra flags */ - /* expected link's program fd; is specified only if - * BPF_F_REPLACE flag is set in flags */ - __u32 old_prog_fd; + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; } link_update; struct { -- cgit v1.2.3 From dbbb27e183b1568d5a907ace1cd144b0709ea52a Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Thu, 23 Mar 2023 04:38:00 -0700 Subject: cfg80211: support RNR for EMA AP As per IEEE Std 802.11ax-2021, 11.1.3.8.3 Discovery of a nontransmitted BSSID profile, an EMA AP that transmits a Beacon frame carrying a partial list of nontransmitted BSSID profiles should include in the frame a Reduced Neighbor Report element carrying information for at least the nontransmitted BSSIDs that are not present in the Multiple BSSID element carried in that frame. Add new nested attribute NL80211_ATTR_EMA_RNR_ELEMS to support the above. Number of RNR elements must be more than or equal to the number of MBSSID elements. This attribute can be used only when EMA is enabled. Userspace is responsible for splitting the RNR into multiple elements such that each element excludes the non-transmitting profiles already included in the MBSSID element (%NL80211_ATTR_MBSSID_ELEMS) at the same index. Each EMA beacon will be generated by adding MBSSID and RNR elements at the same index. If the userspace provides more RNR elements than the number of MBSSID elements then these will be added in every EMA beacon. Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20230323113801.6903-2-quic_alokad@quicinc.com [Johannes: validate elements] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 14e958a32b84..cf4fb981e131 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2794,6 +2794,17 @@ enum nl80211_commands { * @NL80211_ATTR_HW_TIMESTAMP_ENABLED: Indicates whether HW timestamping should * be enabled or not (flag attribute). * + * @NL80211_ATTR_EMA_RNR_ELEMS: Optional nested attribute for + * reduced neighbor report (RNR) elements. This attribute can be used + * only when NL80211_MBSSID_CONFIG_ATTR_EMA is enabled. + * Userspace is responsible for splitting the RNR into multiple + * elements such that each element excludes the non-transmitting + * profiles already included in the MBSSID element + * (%NL80211_ATTR_MBSSID_ELEMS) at the same index. Each EMA beacon + * will be generated by adding MBSSID and RNR elements at the same + * index. If the userspace includes more RNR elements than number of + * MBSSID elements then these will be added in every EMA beacon. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3328,6 +3339,8 @@ enum nl80211_attrs { NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS, NL80211_ATTR_HW_TIMESTAMP_ENABLED, + NL80211_ATTR_EMA_RNR_ELEMS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 233eb4e786b57ea686b51c13a04cc2839fd682fc Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Thu, 23 Mar 2023 18:36:05 +0200 Subject: ethtool: Add support for configuring tx_push_buf_len This attribute, which is part of ethtool's ring param configuration allows the user to specify the maximum number of the packet's payload that can be written directly to the device. Example usage: # ethtool -G [interface] tx-push-buf-len [number of bytes] Co-developed-by: Jakub Kicinski Signed-off-by: Shay Agroskin Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d39ce21381c5..1ebf8d455f07 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -357,6 +357,8 @@ enum { ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, -- cgit v1.2.3 From 954d1fa1ac93aa8a66f7d9a9ba545cf7f020d348 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 28 Mar 2023 10:57:59 +0800 Subject: macvlan: Add netlink attribute for broadcast cutoff Make the broadcast cutoff configurable through netlink. Note that macvlan is weird because there is no central device for us to configure (the lowerdev could be anything). So all the options are duplicated over what could be thousands of child devices. IFLA_MACVLAN_BC_QUEUE_LEN took the approach of taking the maximum of all child device settings. This is unnecessary as we could simply store the option in the port device and take the last child device that gets updated as the value to use. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 57ceb788250f..8d679688efe0 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -635,6 +635,7 @@ enum { IFLA_MACVLAN_MACADDR_COUNT, IFLA_MACVLAN_BC_QUEUE_LEN, IFLA_MACVLAN_BC_QUEUE_LEN_USED, + IFLA_MACVLAN_BC_CUTOFF, __IFLA_MACVLAN_MAX, }; -- cgit v1.2.3 From 9a8aac92eba90b3b7c71d0531db535f5588388f5 Mon Sep 17 00:00:00 2001 From: Kieran Frewen Date: Fri, 24 Feb 2023 10:29:17 +1300 Subject: wifi: nl80211: support advertising S1G capabilities Include S1G capabilities in netlink band info messages. Signed-off-by: Kieran Frewen Co-developed-by: Gilad Itzkovitch Signed-off-by: Gilad Itzkovitch Link: https://lore.kernel.org/r/20230223212917.4010246-1-gilad.itzkovitch@virscient.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index cf4fb981e131..c59fec406da5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4061,6 +4061,10 @@ enum nl80211_band_iftype_attr { * @NL80211_BAND_ATTR_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes * the allowed channel bandwidth configurations. * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. + * @NL80211_BAND_ATTR_S1G_MCS_NSS_SET: S1G capabilities, supported S1G-MCS and NSS + * set subfield, as in the S1G information IE, 5 bytes + * @NL80211_BAND_ATTR_S1G_CAPA: S1G capabilities information subfield as in the + * S1G information IE, 10 bytes * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined * @__NL80211_BAND_ATTR_AFTER_LAST: internal use */ @@ -4081,6 +4085,9 @@ enum nl80211_band_attr { NL80211_BAND_ATTR_EDMG_CHANNELS, NL80211_BAND_ATTR_EDMG_BW_CONFIG, + NL80211_BAND_ATTR_S1G_MCS_NSS_SET, + NL80211_BAND_ATTR_S1G_CAPA, + /* keep last */ __NL80211_BAND_ATTR_AFTER_LAST, NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1 -- cgit v1.2.3 From 28c1b6df436819a7ed8a781835766e45139771a3 Mon Sep 17 00:00:00 2001 From: Eric Sage Date: Mon, 27 Mar 2023 13:44:49 -0400 Subject: netfilter: nfnetlink_queue: enable classid socket info retrieval This enables associating a socket with a v1 net_cls cgroup. Useful for applying a per-cgroup policy when processing packets in userspace. Signed-off-by: Eric Sage Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nfnetlink_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index ef7c97f21a15..efcb7c044a74 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -62,6 +62,7 @@ enum nfqnl_attr_type { NFQA_VLAN, /* nested attribute: packet vlan info */ NFQA_L2HDR, /* full L2 header */ NFQA_PRIORITY, /* skb->priority */ + NFQA_CGROUP_CLASSID, /* __u32 cgroup classid */ __NFQA_MAX }; -- cgit v1.2.3 From a25b8b7136ad43760bd876af62b6e59abd30496c Mon Sep 17 00:00:00 2001 From: Matthieu De Beule Date: Wed, 29 Mar 2023 12:52:18 +0000 Subject: netfilter: Correct documentation errors in nf_tables.h NFTA_RANGE_OP incorrectly says nft_cmp_ops instead of nft_range_ops. NFTA_LOG_GROUP and NFTA_LOG_QTHRESHOLD claim NLA_U32 instead of NLA_U16 NFTA_EXTHDR_SREG isn't documented as a register Signed-off-by: Matthieu De Beule Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9c6f02c26054..c4d4d8e42dc8 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -685,7 +685,7 @@ enum nft_range_ops { * enum nft_range_attributes - nf_tables range expression netlink attributes * * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers) - * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops) + * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_range_ops) * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes) * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes) */ @@ -878,7 +878,7 @@ enum nft_exthdr_op { * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) * @NFTA_EXTHDR_OP: option match type (NLA_U32) - * @NFTA_EXTHDR_SREG: option match type (NLA_U32) + * @NFTA_EXTHDR_SREG: source register (NLA_U32: nft_registers) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -1262,10 +1262,10 @@ enum nft_last_attributes { /** * enum nft_log_attributes - nf_tables log expression netlink attributes * - * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U32) + * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U16) * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING) * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32) - * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U32) + * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U16) * @NFTA_LOG_LEVEL: log level (NLA_U32) * @NFTA_LOG_FLAGS: logging flags (NLA_U32) */ -- cgit v1.2.3 From 2384127e98db52a6ac2577924ad9cae25f3e7472 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 29 Mar 2023 11:54:52 +0200 Subject: net/sched: act_tunnel_key: add support for "don't fragment" extend "act_tunnel_key" to allow specifying TUNNEL_DONT_FRAGMENT. Suggested-by: Ilya Maximets Reviewed-by: Pedro Tammela Acked-by: Jamal Hadi Salim Signed-off-by: Davide Caratti Signed-off-by: Jakub Kicinski --- include/uapi/linux/tc_act/tc_tunnel_key.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 49ad4033951b..37c6f612f161 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -34,6 +34,7 @@ enum { */ TCA_TUNNEL_KEY_ENC_TOS, /* u8 */ TCA_TUNNEL_KEY_ENC_TTL, /* u8 */ + TCA_TUNNEL_KEY_NO_FRAG, /* flag */ __TCA_TUNNEL_KEY_MAX, }; -- cgit v1.2.3 From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:58 -0700 Subject: bpf: Add log_true_size output field to return necessary log buffer size Add output-only log_true_size and btf_log_true_size field to BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return the size of log buffer necessary to fit in all the log contents at specified log_level. This is very useful for BPF loader libraries like libbpf to be able to size log buffer correctly, but could be used by users directly, if necessary, as well. This patch plumbs all this through the code, taking into account actual bpf_attr size provided by user to determine if these new fields are expected by users. And if they are, set them from kernel on return. We refactory btf_parse() function to accommodate this, moving attr and uattr handling inside it. The rest is very straightforward code, which is split from the logging accounting changes in the previous patch to make it simpler to review logic vs UAPI changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e3d3b5160d26..3823100b7934 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1407,6 +1407,11 @@ union bpf_attr { __aligned_u64 fd_array; /* array of FDs */ __aligned_u64 core_relos; __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -1492,6 +1497,11 @@ union bpf_attr { __u32 btf_size; __u32 btf_log_size; __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; }; struct { -- cgit v1.2.3 From f62af20bed2d9e824f51cfc97ff01bc261f40e58 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 11 Apr 2023 21:01:54 +0300 Subject: net/sched: mqprio: allow per-TC user input of FP adminStatus IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each packet priority can be assigned to a "frame preemption status" value of either "express" or "preemptible". Express priorities are transmitted by the local device through the eMAC, and preemptible priorities through the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge layer). The FP adminStatus is defined per packet priority, but 802.1Q clause 12.30.1.1.1 framePreemptionAdminStatus also says that: | Priorities that all map to the same traffic class should be | constrained to use the same value of preemption status. It is impossible to ignore the cognitive dissonance in the standard here, because it practically means that the FP adminStatus only takes distinct values per traffic class, even though it is defined per priority. I can see no valid use case which is prevented by having the kernel take the FP adminStatus as input per traffic class (what we do here). In addition, this also enforces the above constraint by construction. User space network managers which wish to expose FP adminStatus per priority are free to do so; they must only observe the prio_tc_map of the netdev (which presumably is also under their control, when constructing the mqprio netlink attributes). The reason for configuring frame preemption as a property of the Qdisc layer is that the information about "preemptible TCs" is closest to the place which handles the num_tc and prio_tc_map of the netdev. If the UAPI would have been any other layer, it would be unclear what to do with the FP information when num_tc collapses to 0. A key assumption is that only mqprio/taprio change the num_tc and prio_tc_map of the netdev. Not sure if that's a great assumption to make. Having FP in tc-mqprio can be seen as an implementation of the use case defined in 802.1Q Annex S.2 "Preemption used in isolation". There will be a separate implementation of FP in tc-taprio, for the other use cases. Signed-off-by: Vladimir Oltean Reviewed-by: Ferenc Fejes Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 000eec106856..b8d29be91b62 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -719,6 +719,11 @@ enum { #define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) +enum { + TC_FP_EXPRESS = 1, + TC_FP_PREEMPTIBLE = 2, +}; + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; @@ -732,12 +737,23 @@ struct tc_mqprio_qopt { #define TC_MQPRIO_F_MIN_RATE 0x4 #define TC_MQPRIO_F_MAX_RATE 0x8 +enum { + TCA_MQPRIO_TC_ENTRY_UNSPEC, + TCA_MQPRIO_TC_ENTRY_INDEX, /* u32 */ + TCA_MQPRIO_TC_ENTRY_FP, /* u32 */ + + /* add new constants above here */ + __TCA_MQPRIO_TC_ENTRY_CNT, + TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1) +}; + enum { TCA_MQPRIO_UNSPEC, TCA_MQPRIO_MODE, TCA_MQPRIO_SHAPER, TCA_MQPRIO_MIN_RATE64, TCA_MQPRIO_MAX_RATE64, + TCA_MQPRIO_TC_ENTRY, __TCA_MQPRIO_MAX, }; -- cgit v1.2.3 From a721c3e54b80e45cd9202e7fca29ef018bed9069 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 11 Apr 2023 21:01:55 +0300 Subject: net/sched: taprio: allow per-TC user input of FP adminStatus This is a duplication of the FP adminStatus logic introduced for tc-mqprio. Offloading is done through the tc_mqprio_qopt_offload structure embedded within tc_taprio_qopt_offload. So practically, if a device driver is written to treat the mqprio portion of taprio just like standalone mqprio, it gets unified handling of frame preemption. I would have reused more code with taprio, but this is mostly netlink attribute parsing, which is hard to transform into generic code without having something that stinks as a result. We have the same variables with the same semantics, just different nlattr type values (TCA_MQPRIO_TC_ENTRY=5 vs TCA_TAPRIO_ATTR_TC_ENTRY=12; TCA_MQPRIO_TC_ENTRY_FP=2 vs TCA_TAPRIO_TC_ENTRY_FP=3, etc) and consequently, different policies for the nest. Every time nla_parse_nested() is called, an on-stack table "tb" of nlattr pointers is allocated statically, up to the maximum understood nlattr type. That array size is hardcoded as a constant, but when transforming this into a common parsing function, it would become either a VLA (which the Linux kernel rightfully doesn't like) or a call to the allocator. Having FP adminStatus in tc-taprio can be seen as addressing the 802.1Q Annex S.3 "Scheduling and preemption used in combination, no HOLD/RELEASE" and S.4 "Scheduling and preemption used in combination with HOLD/RELEASE" use cases. HOLD and RELEASE events are emitted towards the underlying MAC Merge layer when the schedule hits a Set-And-Hold-MAC or a Set-And-Release-MAC gate operation. So within the tc-taprio UAPI space, one can distinguish between the 2 use cases by choosing whether to use the TC_TAPRIO_CMD_SET_AND_HOLD and TC_TAPRIO_CMD_SET_AND_RELEASE gate operations within the schedule, or just TC_TAPRIO_CMD_SET_GATES. A small part of the change is dedicated to refactoring the max_sdu nlattr parsing to put all logic under the "if" that tests for presence of that nlattr. Signed-off-by: Vladimir Oltean Reviewed-by: Ferenc Fejes Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index b8d29be91b62..51a7addc56c6 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1252,6 +1252,7 @@ enum { TCA_TAPRIO_TC_ENTRY_UNSPEC, TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */ TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */ + TCA_TAPRIO_TC_ENTRY_FP, /* u32 */ /* add new constants above here */ __TCA_TAPRIO_TC_ENTRY_CNT, -- cgit v1.2.3 From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:04 -0700 Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types meant for use in BPF programs. Similarly to other opaque types like bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in user-defined struct types a bpf_refcount can be located, so necessary btf_record plumbing is added to enable this. bpf_refcount is sized to hold a refcount_t. Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in btf_record as refcount_off in addition to being in the field array. Caching refcount_off makes sense for this field because further patches in the series will modify functions that take local kptrs (e.g. bpf_obj_drop) to change their behavior if the type they're operating on is refcounted. So enabling fast "is this type refcounted?" checks is desirable. No such verifier behavior changes are introduced in this patch, just logic to recognize 'struct bpf_refcount' in btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3823100b7934..4b20a7269bee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6985,6 +6985,10 @@ struct bpf_rb_node { __u64 :64; } __attribute__((aligned(8))); +struct bpf_refcount { + __u32 :32; +} __attribute__((aligned(4))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3 From 3b3009ea8abb713b022d94fba95ec270cf6e7eae Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 17 Apr 2023 10:32:26 -0400 Subject: net/handshake: Create a NETLINK service for handling handshake requests When a kernel consumer needs a transport layer security session, it first needs a handshake to negotiate and establish a session. This negotiation can be done in user space via one of the several existing library implementations, or it can be done in the kernel. No in-kernel handshake implementations yet exist. In their absence, we add a netlink service that can: a. Notify a user space daemon that a handshake is needed. b. Once notified, the daemon calls the kernel back via this netlink service to get the handshake parameters, including an open socket on which to establish the session. c. Once the handshake is complete, the daemon reports the session status and other information via a second netlink operation. This operation marks that it is safe for the kernel to use the open socket and the security session established there. The notification service uses a multicast group. Each handshake mechanism (eg, tlshd) adopts its own group number so that the handshake services are completely independent of one another. The kernel can then tell via netlink_has_listeners() whether a handshake service is active and prepared to handle a handshake request. A new netlink operation, ACCEPT, acts like accept(2) in that it instantiates a file descriptor in the user space daemon's fd table. If this operation is successful, the reply carries the fd number, which can be treated as an open and ready file descriptor. While user space is performing the handshake, the kernel keeps its muddy paws off the open socket. A second new netlink operation, DONE, indicates that the user space daemon is finished with the socket and it is safe for the kernel to use again. The operation also indicates whether a session was established successfully. Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- include/uapi/linux/handshake.h | 71 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 include/uapi/linux/handshake.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h new file mode 100644 index 000000000000..7f66ff489b87 --- /dev/null +++ b/include/uapi/linux/handshake.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/handshake.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_HANDSHAKE_H +#define _UAPI_LINUX_HANDSHAKE_H + +#define HANDSHAKE_FAMILY_NAME "handshake" +#define HANDSHAKE_FAMILY_VERSION 1 + +enum handshake_handler_class { + HANDSHAKE_HANDLER_CLASS_NONE, + HANDSHAKE_HANDLER_CLASS_MAX, +}; + +enum handshake_msg_type { + HANDSHAKE_MSG_TYPE_UNSPEC, + HANDSHAKE_MSG_TYPE_CLIENTHELLO, + HANDSHAKE_MSG_TYPE_SERVERHELLO, +}; + +enum handshake_auth { + HANDSHAKE_AUTH_UNSPEC, + HANDSHAKE_AUTH_UNAUTH, + HANDSHAKE_AUTH_PSK, + HANDSHAKE_AUTH_X509, +}; + +enum { + HANDSHAKE_A_X509_CERT = 1, + HANDSHAKE_A_X509_PRIVKEY, + + __HANDSHAKE_A_X509_MAX, + HANDSHAKE_A_X509_MAX = (__HANDSHAKE_A_X509_MAX - 1) +}; + +enum { + HANDSHAKE_A_ACCEPT_SOCKFD = 1, + HANDSHAKE_A_ACCEPT_HANDLER_CLASS, + HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, + HANDSHAKE_A_ACCEPT_TIMEOUT, + HANDSHAKE_A_ACCEPT_AUTH_MODE, + HANDSHAKE_A_ACCEPT_PEER_IDENTITY, + HANDSHAKE_A_ACCEPT_CERTIFICATE, + + __HANDSHAKE_A_ACCEPT_MAX, + HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1) +}; + +enum { + HANDSHAKE_A_DONE_STATUS = 1, + HANDSHAKE_A_DONE_SOCKFD, + HANDSHAKE_A_DONE_REMOTE_AUTH, + + __HANDSHAKE_A_DONE_MAX, + HANDSHAKE_A_DONE_MAX = (__HANDSHAKE_A_DONE_MAX - 1) +}; + +enum { + HANDSHAKE_CMD_READY = 1, + HANDSHAKE_CMD_ACCEPT, + HANDSHAKE_CMD_DONE, + + __HANDSHAKE_CMD_MAX, + HANDSHAKE_CMD_MAX = (__HANDSHAKE_CMD_MAX - 1) +}; + +#define HANDSHAKE_MCGRP_NONE "none" + +#endif /* _UAPI_LINUX_HANDSHAKE_H */ -- cgit v1.2.3 From 2fd5532044a89d2403b543520b4902e196f7d165 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 17 Apr 2023 10:32:33 -0400 Subject: net/handshake: Add a kernel API for requesting a TLSv1.3 handshake To enable kernel consumers of TLS to request a TLS handshake, add support to net/handshake/ to request a handshake upcall. This patch also acts as a template for adding handshake upcall support for other kernel transport layer security providers. Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- include/uapi/linux/handshake.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 7f66ff489b87..1de4d0b95325 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -11,6 +11,7 @@ enum handshake_handler_class { HANDSHAKE_HANDLER_CLASS_NONE, + HANDSHAKE_HANDLER_CLASS_TLSHD, HANDSHAKE_HANDLER_CLASS_MAX, }; @@ -67,5 +68,6 @@ enum { }; #define HANDSHAKE_MCGRP_NONE "none" +#define HANDSHAKE_MCGRP_TLSHD "tlshd" #endif /* _UAPI_LINUX_HANDSHAKE_H */ -- cgit v1.2.3 From 83f6d600796c65ab34b08dbddb5795099dfda4d1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 19 Apr 2023 18:34:58 +0300 Subject: bridge: vlan: Allow setting VLAN neighbor suppression state Add a new VLAN attribute that allows user space to set the neighbor suppression state of the port VLAN. Example: # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]' false # bridge vlan set vid 10 dev swp1 neigh_suppress on # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]' true # bridge vlan set vid 10 dev swp1 neigh_suppress off # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]' false # bridge vlan set vid 10 dev br0 neigh_suppress on Error: bridge: Can't set neigh_suppress for non-port vlans. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c9d624f528c5..f95326fce6bb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -525,6 +525,7 @@ enum { BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS, + BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS, __BRIDGE_VLANDB_ENTRY_MAX, }; #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) -- cgit v1.2.3 From 160656d7201d861a1f2a0bf279a765e8cda2317a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 19 Apr 2023 18:34:59 +0300 Subject: bridge: Allow setting per-{Port, VLAN} neighbor suppression state Add a new bridge port attribute that allows user space to enable per-{Port, VLAN} neighbor suppression. Example: # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]' false # bridge link set dev swp1 neigh_vlan_suppress on # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]' true # bridge link set dev swp1 neigh_vlan_suppress off # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]' false Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8d679688efe0..4ac1000b0ef2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -569,6 +569,7 @@ enum { IFLA_BRPORT_MAB, IFLA_BRPORT_MCAST_N_GROUPS, IFLA_BRPORT_MCAST_MAX_GROUPS, + IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From dfc39d4026fb2432363c0f77543c4cf3adca4c7b Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Wed, 19 Apr 2023 15:24:16 +0800 Subject: net/packet: support mergeable feature of virtio Packet sockets, like tap, can be used as the backend for kernel vhost. In packet sockets, virtio net header size is currently hardcoded to be the size of struct virtio_net_hdr, which is 10 bytes; however, it is not always the case: some virtio features, such as mrg_rxbuf, need virtio net header to be 12-byte long. Mergeable buffers, as a virtio feature, is worthy of supporting: packets that are larger than one-mbuf size will be dropped in vhost worker's handle_rx if mrg_rxbuf feature is not used, but large packets cannot be avoided and increasing mbuf's size is not economical. With this virtio feature enabled by virtio-user, packet sockets with hardcoded 10-byte virtio net header will parse mac head incorrectly in packet_snd by taking the last two bytes of virtio net header as part of mac header. This incorrect mac header parsing will cause packet to be dropped due to invalid ether head checking in later under-layer device packet receiving. By adding extra field vnet_hdr_sz with utilizing holes in struct packet_sock to record currently used virtio net header size and supporting extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet sockets can know the exact length of virtio net header that virtio user gives. In packet_snd, tpacket_snd and packet_recvmsg, instead of using hardcoded virtio net header size, it can get the exact vnet_hdr_sz from corresponding packet_sock, and parse mac header correctly based on this information to avoid the packets being mistakenly dropped. Signed-off-by: Jianfeng Tan Co-developed-by: Anqi Shen Signed-off-by: Anqi Shen Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 78c981d6a9d4..9efc42382fdb 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -59,6 +59,7 @@ struct sockaddr_ll { #define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_DATA 22 #define PACKET_IGNORE_OUTGOING 23 +#define PACKET_VNET_HDR_SZ 24 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 -- cgit v1.2.3 From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:54 +0200 Subject: bpf: add bpf_link support for BPF_NETFILTER programs Add bpf_link support skeleton. To keep this reviewable, no bpf program can be invoked yet, if a program is attached only a c-stub is called and not the actual bpf program. Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig. Uapi example usage: union bpf_attr attr = { }; attr.link_create.prog_fd = progfd; attr.link_create.attach_type = 0; /* unused */ attr.link_create.netfilter.pf = PF_INET; attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; attr.link_create.netfilter.priority = -128; err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); ... this would attach progfd to ipv4:input hook. Such hook gets removed automatically if the calling program exits. BPF_NETFILTER program invocation is added in followup change. NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it allows to tell userspace which program is attached at the given hook when user runs 'nft hook list' command rather than just the priority and not-very-helpful 'this hook runs a bpf prog but I can't tell which one'. Will also be used to disallow registration of two bpf programs with same priority in a followup patch. v4: arm32 cmpxchg only supports 32bit operand s/prio/priority/ v3: restrict prog attachment to ip/ip6 for now, lets lift restrictions if more use cases pop up (arptables, ebtables, netdev ingress/egress etc). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4b20a7269bee..1bb11a6ee667 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -986,6 +986,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, }; enum bpf_attach_type { @@ -1050,6 +1051,7 @@ enum bpf_link_type { BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, MAX_BPF_LINK_TYPE, }; @@ -1560,6 +1562,12 @@ union bpf_attr { */ __u64 cookie; } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; }; } link_create; @@ -6410,6 +6418,12 @@ struct bpf_link_info { struct { __u32 map_id; } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From 506a74db7e019a277e987fa65654bdd953859d5b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:56 +0200 Subject: netfilter: nfnetlink hook: dump bpf prog id This allows userspace ("nft list hooks") to show which bpf program is attached to which hook. Without this, user only knows bpf prog is attached at prio x, y, z at INPUT and FORWARD, but can't tell which program is where. v4: kdoc fixups (Simon Horman) Link: https://lore.kernel.org/bpf/ZEELzpNCnYJuZyod@corigine.com/ Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-4-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/uapi/linux/netfilter/nfnetlink_hook.h | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h index bbcd285b22e1..84a561a74b98 100644 --- a/include/uapi/linux/netfilter/nfnetlink_hook.h +++ b/include/uapi/linux/netfilter/nfnetlink_hook.h @@ -32,8 +32,12 @@ enum nfnl_hook_attributes { /** * enum nfnl_hook_chain_info_attributes - chain description * - * NFNLA_HOOK_INFO_DESC: nft chain and table name (enum nft_table_attributes) (NLA_NESTED) - * NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32) + * @NFNLA_HOOK_INFO_DESC: nft chain and table name (NLA_NESTED) + * @NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32) + * + * NFNLA_HOOK_INFO_DESC depends on NFNLA_HOOK_INFO_TYPE value: + * NFNL_HOOK_TYPE_NFTABLES: enum nft_table_attributes + * NFNL_HOOK_TYPE_BPF: enum nfnl_hook_bpf_attributes */ enum nfnl_hook_chain_info_attributes { NFNLA_HOOK_INFO_UNSPEC, @@ -55,10 +59,24 @@ enum nfnl_hook_chain_desc_attributes { /** * enum nfnl_hook_chaintype - chain type * - * @NFNL_HOOK_TYPE_NFTABLES nf_tables base chain + * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain + * @NFNL_HOOK_TYPE_BPF: bpf program */ enum nfnl_hook_chaintype { NFNL_HOOK_TYPE_NFTABLES = 0x1, + NFNL_HOOK_TYPE_BPF, +}; + +/** + * enum nfnl_hook_bpf_attributes - bpf prog description + * + * @NFNLA_HOOK_BPF_ID: bpf program id (NLA_U32) + */ +enum nfnl_hook_bpf_attributes { + NFNLA_HOOK_BPF_UNSPEC, + NFNLA_HOOK_BPF_ID, + __NFNLA_HOOK_BPF_MAX, }; +#define NFNLA_HOOK_BPF_MAX (__NFNLA_HOOK_BPF_MAX - 1) #endif /* _NFNL_HOOK_H */ -- cgit v1.2.3