From 731cddce6dd110fb2cdee34eddb48599e7251517 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 1 Feb 2022 19:06:27 +0100 Subject: net: mac802154: Convert the symbol duration into nanoseconds Tdsym is often given in the spec as pretty small numbers in microseconds and hence was reflected in the code as symbol_duration and was stored as a u8. Actually, for UWB PHYs, the symbol duration is given in nanoseconds and are as precise as picoseconds. In order to handle better these PHYs, change the type of symbol_duration to u32 and store this value in nanoseconds. All the users of this variable are updated in a mechanical way. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220201180629.93410-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 833672d6fbe4..92d4f178caca 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -203,8 +203,8 @@ struct wpan_phy { /* PHY depended MAC PIB values */ - /* 802.15.4 acronym: Tdsym in usec */ - u8 symbol_duration; + /* 802.15.4 acronym: Tdsym in nsec */ + u32 symbol_duration; /* lifs and sifs periods timing */ u16 lifs_period; u16 sifs_period; -- cgit v1.2.3 From 781830c800ddd19566846c634de09fadf37eafde Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 1 Feb 2022 19:06:28 +0100 Subject: net: mac802154: Set durations automatically As depicted in the IEEE 802.15.4 specification, modulation/bands are tight to a number of page/channels so we can for most of them derive the durations automatically. The two locations that must call this new helper to set the variou symbol durations are: - when manually requesting a channel change though the netlink interface - at PHY creation, once the device driver has set the default page/channel If an information is missing, the symbol duration is not touched, a debug message is eventually printed. This keeps the compatibility with the unconverted drivers for which it was too complicated for me to find their precise information. If they initially provided a symbol duration, it would be kept. If they don't, the symbol duration value is left untouched. Once the symbol duration derived, the lifs and sifs durations are updated as well. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220201180629.93410-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 92d4f178caca..85f9e8417688 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -415,4 +415,6 @@ static inline const char *wpan_phy_name(struct wpan_phy *phy) return dev_name(&phy->dev); } +void ieee802154_configure_durations(struct wpan_phy *phy); + #endif /* __NET_CFG802154_H */ -- cgit v1.2.3 From 40570375356c874b1578e05c1dcc3ff7c1322dbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Apr 2022 16:35:38 -0700 Subject: tcp: add accessors to read/set tp->snd_cwnd We had various bugs over the years with code breaking the assumption that tp->snd_cwnd is greater than zero. Lately, syzbot reported the WARN_ON_ONCE(!tp->prior_cwnd) added in commit 8b8a321ff72c ("tcp: fix zero cwnd in tcp_cwnd_reduction") can trigger, and without a repro we would have to spend considerable time finding the bug. Instead of complaining too late, we want to catch where and when tp->snd_cwnd is set to an illegal value. Signed-off-by: Eric Dumazet Suggested-by: Yuchung Cheng Cc: Neal Cardwell Acked-by: Yuchung Cheng Link: https://lore.kernel.org/r/20220405233538.947344-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 70ca4a5e330a..54aa2f02bb63 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1207,9 +1207,20 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) +{ + return tp->snd_cwnd; +} + +static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) +{ + WARN_ON_ONCE((int)val <= 0); + tp->snd_cwnd = val; +} + static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd < tp->snd_ssthresh; + return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) @@ -1235,8 +1246,8 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, - ((tp->snd_cwnd >> 1) + - (tp->snd_cwnd >> 2))); + ((tcp_snd_cwnd(tp) >> 1) + + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ @@ -1278,7 +1289,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; + return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; return tp->is_cwnd_limited; } -- cgit v1.2.3 From 51454ea42c1ab4e0c2828bb0d4d53957976980de Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Mon, 4 Apr 2022 01:15:24 +0200 Subject: ipv6: fix locking issues with loops over idev->addr_list idev->addr_list needs to be protected by idev->lock. However, it is not always possible to do so while iterating and performing actions on inet6_ifaddr instances. For example, multiple functions (like addrconf_{join,leave}_anycast) eventually call down to other functions that acquire the idev->lock. The current code temporarily unlocked the idev->lock during the loops, which can cause race conditions. Moving the locks up is also not an appropriate solution as the ordering of lock acquisition will be inconsistent with for example mc_lock. This solution adds an additional field to inet6_ifaddr that is used to temporarily add the instances to a temporary list while holding idev->lock. The temporary list can then be traversed without holding idev->lock. This change was done in two places. In addrconf_ifdown, the list_for_each_entry_safe variant of the list loop is also no longer necessary as there is no deletion within that specific loop. Suggested-by: Paolo Abeni Signed-off-by: Niels Dossche Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220403231523.45843-1-dossche.niels@gmail.com Signed-off-by: Jakub Kicinski --- include/net/if_inet6.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 4cfdef6ca4f6..c8490729b4ae 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -64,6 +64,14 @@ struct inet6_ifaddr { struct hlist_node addr_lst; struct list_head if_list; + /* + * Used to safely traverse idev->addr_list in process context + * if the idev->lock needed to protect idev->addr_list cannot be held. + * In that case, add the items to this list temporarily and iterate + * without holding idev->lock. + * See addrconf_ifdown and dev_forward_change. + */ + struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; -- cgit v1.2.3 From 15fcdf6ae116d1e8da4ff76c6fd82514f5ea501b Mon Sep 17 00:00:00 2001 From: Ping Gan Date: Wed, 6 Apr 2022 09:09:56 +0800 Subject: tcp: Add tracepoint for tcp_set_ca_state The congestion status of a tcp flow may be updated since there is congestion between tcp sender and receiver. It makes sense to add tracepoint for congestion status set function to summate cc status duration and evaluate the performance of network and congestion algorithm. the backgound of this patch is below. Link: https://github.com/iovisor/bcc/pull/3899 Signed-off-by: Ping Gan Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406010956.19656-1-jacky_gam_2001@163.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 54aa2f02bb63..6d50a662bf89 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1139,15 +1139,6 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk) return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } -static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - if (icsk->icsk_ca_ops->set_state) - icsk->icsk_ca_ops->set_state(sk, ca_state); - icsk->icsk_ca_state = ca_state; -} - static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1156,6 +1147,9 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk->icsk_ca_ops->cwnd_event(sk, event); } +/* From tcp_cong.c */ +void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From 9027ce0b071a1bbd046682907fc2e23ca3592883 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 23 Mar 2022 14:22:01 +0100 Subject: netfilter: ecache: move to separate structure This makes it easier for a followup patch to only expose ecache related parts of nf_conntrack_net structure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index b08b70989d2c..69e6c6a218be 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -43,6 +43,11 @@ union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; +struct nf_conntrack_net_ecache { + struct delayed_work dwork; + struct netns_ct *ct_net; +}; + struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; @@ -58,8 +63,7 @@ struct nf_conntrack_net { struct ctl_table_header *sysctl_header; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS - struct delayed_work ecache_dwork; - struct netns_ct *ct_net; + struct nf_conntrack_net_ecache ecache; #endif }; -- cgit v1.2.3 From c3f6bb74137c68b515b7e2ff123a80611e801013 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Apr 2022 20:38:16 -0700 Subject: tls: rx: don't store the record type in socket context Original TLS implementation was handling one record at a time. It stashed the type of the record inside tls context (per socket structure) for convenience. When async crypto support was added [1] the author had to use skb->cb to store the type per-message. The use of skb->cb overlaps with strparser, however, so a hybrid approach was taken where type is stored in context while parsing (since we parse a message at a time) but once parsed its copied to skb->cb. Recently a workaround for sockmaps [2] exposed the previously private struct _strp_msg and started a trend of adding user fields directly in strparser's header. This is cleaner than storing information about an skb in the context. This change is not strictly necessary, but IMHO the ownership of the context field is confusing. Information naturally belongs to the skb. [1] commit 94524d8fc965 ("net/tls: Add support for async decryption of tls records") [2] commit b2c4618162ec ("bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/strparser.h | 3 +++ include/net/tls.h | 10 +++------- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/strparser.h b/include/net/strparser.h index 732b7097d78e..c271543076cf 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -70,6 +70,9 @@ struct sk_skb_cb { * when dst_reg == src_reg. */ u64 temp_reg; + struct tls_msg { + u8 control; + } tls; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) diff --git a/include/net/tls.h b/include/net/tls.h index b6968a5b5538..c3717cd1f1cd 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -117,11 +117,6 @@ struct tls_rec { u8 aead_req_ctx[]; }; -struct tls_msg { - struct strp_msg rxm; - u8 control; -}; - struct tx_work { struct delayed_work work; struct sock *sk; @@ -152,7 +147,6 @@ struct tls_sw_context_rx { void (*saved_data_ready)(struct sock *sk); struct sk_buff *recv_pkt; - u8 control; u8 async_capable:1; u8 decrypted:1; atomic_t decrypt_pending; @@ -411,7 +405,9 @@ void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); static inline struct tls_msg *tls_msg(struct sk_buff *skb) { - return (struct tls_msg *)strp_msg(skb); + struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb; + + return &scb->tls; } static inline bool tls_is_partially_sent_record(struct tls_context *ctx) -- cgit v1.2.3 From 7dc59c33d62c4520a119051d4486c214ef5caa23 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Apr 2022 20:38:17 -0700 Subject: tls: rx: don't store the decryption status in socket context Similar justification to previous change, the information about decryption status belongs in the skb. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/strparser.h | 1 + include/net/tls.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/strparser.h b/include/net/strparser.h index c271543076cf..a191486eb1e4 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -72,6 +72,7 @@ struct sk_skb_cb { u64 temp_reg; struct tls_msg { u8 control; + u8 decrypted; } tls; }; diff --git a/include/net/tls.h b/include/net/tls.h index c3717cd1f1cd..f040edc97c50 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -148,7 +148,6 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 async_capable:1; - u8 decrypted:1; atomic_t decrypt_pending; /* protect crypto_wait with decrypt_pending*/ spinlock_t decrypt_compl_lock; -- cgit v1.2.3 From a8340cc02beed4ffbb5e7b1b0eadca445323fc6a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Apr 2022 20:38:19 -0700 Subject: tls: rx: use a define for tag length TLS 1.3 has to strip padding, and it starts out 16 bytes from the end of the record. Make it clear this is because of the auth tag. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index f040edc97c50..a01c264e5f15 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -64,6 +64,7 @@ #define TLS_AAD_SPACE_SIZE 13 #define MAX_IV_SIZE 16 +#define TLS_TAG_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 /* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes. -- cgit v1.2.3 From c2ccf84ecb715bb81dc7f51e69d680a95bf055ae Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Apr 2022 10:35:22 +0300 Subject: net/sched: act_api: Add extack to offload_act_setup() callback The callback is used by various actions to populate the flow action structure prior to offload. Pass extack to this callback so that the various actions will be able to report accurate error messages to user space. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++- include/net/pkt_cls.h | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 3049cb69c025..9cf6870b526e 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -134,7 +134,8 @@ struct tc_action_ops { (*get_psample_group)(const struct tc_action *a, tc_action_priv_destructor *destructor); int (*offload_act_setup)(struct tc_action *act, void *entry_data, - u32 *index_inc, bool bind); + u32 *index_inc, bool bind, + struct netlink_ext_ack *extack); }; struct tc_action_net { diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3b57a93228a..8cf001aed858 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -547,10 +547,12 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } int tc_setup_offload_action(struct flow_action *flow_action, - const struct tcf_exts *exts); + const struct tcf_exts *exts, + struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, - struct tc_action *actions[]); + struct tc_action *actions[], + struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); -- cgit v1.2.3 From 69642c2ab2f553fd8c4c121fcdf3b3054c727e86 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Apr 2022 10:35:23 +0300 Subject: net/sched: act_gact: Add extack messages for offload failure For better error reporting to user space, add extack messages when gact action offload fails. Example: # echo 1 > /sys/kernel/tracing/events/netlink/netlink_extack/enable # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action continue Error: cls_matchall: Failed to setup flow action. We have an error talking to the kernel # cat /sys/kernel/tracing/trace_pipe tc-181 [002] b..1. 105.493450: netlink_extack: msg=act_gact: Offload of "continue" action is not supported tc-181 [002] ..... 105.493466: netlink_extack: msg=cls_matchall: Failed to setup flow action # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action reclassify Error: cls_matchall: Failed to setup flow action. We have an error talking to the kernel # cat /sys/kernel/tracing/trace_pipe tc-183 [002] b..1. 124.126477: netlink_extack: msg=act_gact: Offload of "reclassify" action is not supported tc-183 [002] ..... 124.126489: netlink_extack: msg=cls_matchall: Failed to setup flow action # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action pipe action drop Error: cls_matchall: Failed to setup flow action. We have an error talking to the kernel # cat /sys/kernel/tracing/trace_pipe tc-185 [002] b..1. 137.097791: netlink_extack: msg=act_gact: Offload of "pipe" action is not supported tc-185 [002] ..... 137.097804: netlink_extack: msg=cls_matchall: Failed to setup flow action Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index eb8f01c819e6..832efd40e023 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -59,4 +59,19 @@ static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a) return READ_ONCE(a->tcfa_action) & TC_ACT_EXT_VAL_MASK; } +static inline bool is_tcf_gact_continue(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_UNSPEC, false); +} + +static inline bool is_tcf_gact_reclassify(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_RECLASSIFY, false); +} + +static inline bool is_tcf_gact_pipe(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_PIPE, false); +} + #endif /* __NET_TC_GACT_H */ -- cgit v1.2.3 From a9c64939b669b3c83cc54738d1986430e6beaff2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Apr 2022 10:35:28 +0300 Subject: net/sched: act_skbedit: Add extack messages for offload failure For better error reporting to user space, add extack messages when skbedit action offload fails. Example: # echo 1 > /sys/kernel/tracing/events/netlink/netlink_extack/enable # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action skbedit queue_mapping 1234 Error: cls_matchall: Failed to setup flow action. We have an error talking to the kernel # cat /sys/kernel/tracing/trace_pipe tc-185 [002] b..1. 31.802414: netlink_extack: msg=act_skbedit: Offload not supported when "queue_mapping" option is used tc-185 [002] ..... 31.802418: netlink_extack: msg=cls_matchall: Failed to setup flow action # tc filter add dev dummy0 ingress pref 1 proto all matchall skip_sw action skbedit inheritdsfield Error: cls_matchall: Failed to setup flow action. We have an error talking to the kernel # cat /sys/kernel/tracing/trace_pipe tc-187 [002] b..1. 45.985145: netlink_extack: msg=act_skbedit: Offload not supported when "inheritdsfield" option is used tc-187 [002] ..... 45.985160: netlink_extack: msg=cls_matchall: Failed to setup flow action Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 00bfee70609e..cab8229b9bed 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -94,4 +94,16 @@ static inline u32 tcf_skbedit_priority(const struct tc_action *a) return priority; } +/* Return true iff action is queue_mapping */ +static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); +} + +/* Return true iff action is inheritdsfield */ +static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) +{ + return is_tcf_skbedit_with_flag(a, SKBEDIT_F_INHERITDSFIELD); +} + #endif /* __NET_TC_SKBEDIT_H */ -- cgit v1.2.3 From 37943f047bfb88ba4dfc7a522563f57c86d088a0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Apr 2022 11:31:27 -0700 Subject: tls: rx: simplify async wait Since we are protected from async completions by decrypt_compl_lock we can drop the async_notify and reinit the completion before we start waiting. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index a01c264e5f15..6fe78361c8c8 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -152,7 +152,6 @@ struct tls_sw_context_rx { atomic_t decrypt_pending; /* protect crypto_wait with decrypt_pending*/ spinlock_t decrypt_compl_lock; - bool async_notify; }; struct tls_record_info { -- cgit v1.2.3 From c1b8a56755ee121f9fa374de9eec36c91a7da25f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:49 +0800 Subject: net: sock: introduce sock_queue_rcv_skb_reason() In order to report the reasons of skb drops in 'sock_queue_rcv_skb()', introduce the function 'sock_queue_rcv_skb_reason()'. As the return value of 'sock_queue_rcv_skb()' is used as the error code, we can't make it as drop reason and have to pass extra output argument. 'sock_queue_rcv_skb()' is used in many places, so we can't change it directly. Introduce the new function 'sock_queue_rcv_skb_reason()' and make 'sock_queue_rcv_skb()' an inline call to it. Reviewed-by: Hao Peng Reviewed-by: Jiang Biao Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/sock.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index c4b91fc19b9c..1a988e605f09 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2392,7 +2392,14 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason); + +static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + return sock_queue_rcv_skb_reason(sk, skb, NULL); +} int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); -- cgit v1.2.3 From b384c95a861eebf47e88695cf6a29f34e0b10b0f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:52 +0800 Subject: net: icmp: add skb drop reasons to icmp protocol Replace kfree_skb() used in icmp_rcv() and icmpv6_rcv() with kfree_skb_reason(). In order to get the reasons of the skb drops after icmp message handle, we change the return type of 'handler()' in 'struct icmp_control' from 'bool' to 'enum skb_drop_reason'. This may change its original intention, as 'false' means failure, but 'SKB_NOT_DROPPED_YET' means success now. Therefore, all 'handler' and the call of them need to be handled. Following 'handler' functions are involved: icmp_unreach() icmp_redirect() icmp_echo() icmp_timestamp() icmp_discard() And following new drop reasons are added: SKB_DROP_REASON_ICMP_CSUM SKB_DROP_REASON_INVALID_PROTO The reason 'INVALID_PROTO' is introduced for the case that the packet doesn't follow rfc 1122 and is dropped. This is not a common case, and I believe we can locate the problem from the data in the packet. For now, this 'INVALID_PROTO' is used for the icmp broadcasts with wrong types. Maybe there should be a document file for these reasons. For example, list all the case that causes the 'UNHANDLED_PROTO' and 'INVALID_PROTO' drop reason. Therefore, users can locate their problems according to the document. Reviewed-by: Hao Peng Reviewed-by: Jiang Biao Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ping.h b/include/net/ping.h index 2fe78874318c..b68fbfdb606f 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -76,7 +76,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len); int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -bool ping_rcv(struct sk_buff *skb); +enum skb_drop_reason ping_rcv(struct sk_buff *skb); #ifdef CONFIG_PROC_FS void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family); -- cgit v1.2.3 From e5c95ca094cfe59a4013d711f87ff034cba30f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 4 Apr 2022 23:01:08 +0200 Subject: mac80211: Improve confusing comment around tx_info clearing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment above the ieee80211_tx_info_clear_status() helper was somewhat confusing as to which fields it was or wasn't clearing. So replace it by something that is hopefully more, well, clear. Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220404210108.2684907-1-toke@toke.dk Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 382ebb862ea8..db992f71604d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1201,9 +1201,9 @@ static inline struct ieee80211_rx_status *IEEE80211_SKB_RXCB(struct sk_buff *skb * in the TX status but the rate control information (it does clear * the count since you need to fill that in anyway). * - * NOTE: You can only use this function if you do NOT use - * info->driver_data! Use info->rate_driver_data - * instead if you need only the less space that allows. + * NOTE: While the rates array is kept intact, this will wipe all of the + * driver_data fields in info, so it's up to the driver to restore + * any fields it needs after calling this helper. */ static inline void ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) -- cgit v1.2.3 From 6d945a33f2b0aa24fc210dadaa0af3e8218e7002 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 25 Mar 2022 11:42:41 +0100 Subject: mac80211: introduce BSS color collision detection Add ieee80211_rx_check_bss_color_collision routine in order to introduce BSS color collision detection in mac80211 if it is not supported in HW/FW (e.g. for mt7915 chipset). Add IEEE80211_HW_DETECTS_COLOR_COLLISION flag to let the driver notify BSS color collision detection is supported in HW/FW. Set this for ath11k which apparently didn't need this code. Tested-by: Peter Chiu Co-developed-by: Ryder Lee Signed-off-by: Ryder Lee Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/a05eeeb1841a84560dc5aaec77894fcb69a54f27.1648204871.git.lorenzo@kernel.org [clarify commit message a bit, move flag to mac80211] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index db992f71604d..0943b2626f4d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2434,6 +2434,9 @@ struct ieee80211_txq { * usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to * the stack. * + * @IEEE80211_HW_DETECTS_COLOR_COLLISION: HW/driver has support for BSS color + * collision detection and doesn't need it in software. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2489,6 +2492,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD, IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, + IEEE80211_HW_DETECTS_COLOR_COLLISION, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 046d2e7c50e3087a32a85fd384c21f896dbccf83 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Mon, 4 Apr 2022 21:11:23 +0530 Subject: mac80211: prepare sta handling for MLO support Currently in mac80211 each STA object is represented using sta_info datastructure with the associated STA specific information and drivers access ieee80211_sta part of it. With MLO (Multi Link Operation) support being added in 802.11be standard, though the association is logically with a single Multi Link capable STA, at the physical level communication can happen via different advertised links (uniquely identified by Channel, operating class, BSSID) and hence the need to handle multiple link STA parameters within a composite sta_info object called the MLD STA. The different link STA part of MLD STA are identified using the link address which can be same or different as the MLD STA address and unique link id based on the link vif. To support extension of such a model, the sta_info datastructure is modified to hold multiple link STA objects with link specific params currently within sta_info moved to this new structure. Similarly this is done for ieee80211_sta as well which will be accessed within mac80211 as well as by drivers, hence trivial driver changes are expected to support this. For current non MLO supported drivers, only one link STA is present and link information is accessed via 'deflink' member. For MLO drivers, we still need to define the APIs etc. to get the correct link ID and access the correct part of the station info. Currently in mac80211, all link STA info are accessed directly via deflink. These will be updated to access via link pointers indexed by link id with MLO support patches, with link id being 0 for non MLO supported cases. Except for couple of macro related changes, below spatch takes care of updating mac80211 and driver code to access to the link STA info via deflink. @ieee80211_sta@ struct ieee80211_sta *s; struct sta_info *si; identifier var = {supp_rates, ht_cap, vht_cap, he_cap, he_6ghz_capa, eht_cap, rx_nss, bandwidth, txpwr}; @@ ( s-> - var + deflink.var | si->sta. - var + deflink.var ) @sta_info@ struct sta_info *si; identifier var = {gtk, pcpu_rx_stats, rx_stats, rx_stats_avg, status_stats, tx_stats, cur_max_bandwidth}; @@ ( si-> - var + deflink.var ) Signed-off-by: Sriram R Link: https://lore.kernel.org/r/1649086883-13246-1-git-send-email-quic_srirrama@quicinc.com [remove MLO-drivers notes from commit message, not clear yet; run spatch] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 80 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0943b2626f4d..75880fc70700 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2056,6 +2056,45 @@ struct ieee80211_sta_txpwr { enum nl80211_tx_power_setting type; }; +#define MAX_STA_LINKS 15 + +/** + * struct ieee80211_link_sta - station Link specific info + * All link specific info for a STA link for a non MLD STA(single) + * or a MLD STA(multiple entries) are stored here. + * + * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr + * in ieee80211_sta. For MLO Link STA this addr can be same or different + * from addr in ieee80211_sta (representing MLD STA addr) + * @supp_rates: Bitmap of supported rates + * @ht_cap: HT capabilities of this STA; restricted to our own capabilities + * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @he_cap: HE capabilities of this STA + * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities + * @eht_cap: EHT capabilities of this STA + * @bandwidth: current bandwidth the station can receive with + * @rx_nss: in HT/VHT, the maximum number of spatial streams the + * station can receive at the moment, changed by operating mode + * notifications and capabilities. The value is only valid after + * the station moves to associated state. + * @txpwr: the station tx power configuration + * + */ +struct ieee80211_link_sta { + u8 addr[ETH_ALEN]; + + u32 supp_rates[NUM_NL80211_BANDS]; + struct ieee80211_sta_ht_cap ht_cap; + struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_sta_he_cap he_cap; + struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; + + u8 rx_nss; + enum ieee80211_sta_rx_bandwidth bandwidth; + struct ieee80211_sta_txpwr txpwr; +}; + /** * struct ieee80211_sta - station table entry * @@ -2065,15 +2104,11 @@ struct ieee80211_sta_txpwr { * either be protected by rcu_read_lock() explicitly or implicitly, * or you must take good care to not use such a pointer after a * call to your sta_remove callback that removed it. + * This also represents the MLD STA in case of MLO association + * and holds pointers to various link STA's * * @addr: MAC address * @aid: AID we assigned to the station if we're an AP - * @supp_rates: Bitmap of supported rates (per band) - * @ht_cap: HT capabilities of this STA; restricted to our own capabilities - * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities - * @he_cap: HE capabilities of this STA - * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities - * @eht_cap: EHT capabilities of this STA * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2085,11 +2120,6 @@ struct ieee80211_sta_txpwr { * if wme is supported. The bits order is like in * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported. - * @bandwidth: current bandwidth the station can receive with - * @rx_nss: in HT/VHT, the maximum number of spatial streams the - * station can receive at the moment, changed by operating mode - * notifications and capabilities. The value is only valid after - * the station moves to associated state. * @smps_mode: current SMPS mode (off, static or dynamic) * @rates: rate control selection table * @tdls: indicates whether the STA is a TDLS peer @@ -2102,25 +2132,28 @@ struct ieee80211_sta_txpwr { * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID - * @txpwr: the station tx power configuration * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames + * @multi_link_sta: Identifies if this sta is a MLD STA + * @deflink: This holds the default link STA information, for non MLO STA all link + * specific STA information is accessed through @deflink or through + * link[0] which points to address of @deflink. For MLO Link STA + * the first added link STA will point to deflink. + * @link: reference to Link Sta entries. For Non MLO STA, except 1st link, + * i.e link[0] all links would be assigned to NULL by default and + * would access link information via @deflink or link[0]. For MLO + * STA, first link STA being added will point its link pointer to + * @deflink address and remaining would be allocated and the address + * would be assigned to link[link_id] where link_id is the id assigned + * by the AP. */ struct ieee80211_sta { - u32 supp_rates[NUM_NL80211_BANDS]; u8 addr[ETH_ALEN]; u16 aid; - struct ieee80211_sta_ht_cap ht_cap; - struct ieee80211_sta_vht_cap vht_cap; - struct ieee80211_sta_he_cap he_cap; - struct ieee80211_he_6ghz_capa he_6ghz_capa; - struct ieee80211_sta_eht_cap eht_cap; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; - u8 rx_nss; - enum ieee80211_sta_rx_bandwidth bandwidth; enum ieee80211_smps_mode smps_mode; struct ieee80211_sta_rates __rcu *rates; bool tdls; @@ -2147,10 +2180,13 @@ struct ieee80211_sta { bool support_p2p_ps; u16 max_rc_amsdu_len; u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; - struct ieee80211_sta_txpwr txpwr; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; + bool multi_link_sta; + struct ieee80211_link_sta deflink; + struct ieee80211_link_sta *link[MAX_STA_LINKS]; + /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -6371,7 +6407,7 @@ static inline int rate_supported(struct ieee80211_sta *sta, enum nl80211_band band, int index) { - return (sta == NULL || sta->supp_rates[band] & BIT(index)); + return (sta == NULL || sta->deflink.supp_rates[band] & BIT(index)); } static inline s8 -- cgit v1.2.3 From 888ade8f90d7dbbdc8552ae9b23d311f9e61ab0e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 8 Apr 2022 22:08:37 +0200 Subject: ipv4: Use dscp_t in struct fib_rt_info Use the new dscp_t type to replace the tos field of struct fib_rt_info. This ensures ECN bits are ignored and makes it compatible with the fa_dscp field of struct fib_alias. This also allows sparse to flag potential incorrect uses of DSCP and ECN bits. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6a82bcb8813b..f08ba531ac08 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -212,7 +212,7 @@ struct fib_rt_info { u32 tb_id; __be32 dst; int dst_len; - u8 tos; + dscp_t dscp; u8 type; u8 offload:1, trap:1, -- cgit v1.2.3 From 568a3f33b4273833f1b1e4a39d4a3410c4770c32 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 8 Apr 2022 22:08:40 +0200 Subject: ipv4: Use dscp_t in struct fib_entry_notifier_info Use the new dscp_t type to replace the tos field of struct fib_entry_notifier_info. This ensures ECN bits are ignored and makes it compatible with the dscp field of struct fib_rt_info. This also allows sparse to flag potential incorrect uses of DSCP and ECN bits. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f08ba531ac08..a378eff827c7 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -225,7 +225,7 @@ struct fib_entry_notifier_info { u32 dst; int dst_len; struct fib_info *fi; - u8 tos; + dscp_t dscp; u8 type; u32 tb_id; }; -- cgit v1.2.3 From ec095263a965720e1ca39db1d9c5cd47846c789b Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 11 Apr 2022 14:49:55 +0200 Subject: net: remove noblock parameter from recvmsg() entities The internal recvmsg() functions have two parameters 'flags' and 'noblock' that were merged inside skb_recv_datagram(). As a follow up patch to commit f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()") this patch removes the separate 'noblock' parameter for recvmsg(). Analogue to the referenced patch for skb_recv_datagram() the 'flags' and 'noblock' parameters are unnecessarily split up with e.g. err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); or in err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); instead of simply using only flags all the time and check for MSG_DONTWAIT where needed (to preserve for the formerly separated no(n)block condition). Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net Signed-off-by: Paolo Abeni --- include/net/ping.h | 2 +- include/net/sctp/sctp.h | 2 +- include/net/sock.h | 3 +-- include/net/tcp.h | 2 +- include/net/tls.h | 2 +- include/net/udp.h | 8 ++++---- 6 files changed, 9 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ping.h b/include/net/ping.h index b68fbfdb606f..e4ff3911cbf5 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -71,7 +71,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info); int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *); -int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, +int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index bf3716fe83e0..a04999ee99b0 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -103,7 +103,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc); extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); -struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); +struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *); void sctp_transport_walk_start(struct rhashtable_iter *iter); diff --git a/include/net/sock.h b/include/net/sock.h index 1a988e605f09..a01d6c421aa2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1202,8 +1202,7 @@ struct proto { int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, - int *addr_len); + size_t len, int flags, int *addr_len); int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, diff --git a/include/net/tcp.h b/include/net/tcp.h index 6d50a662bf89..679b1964d494 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -407,7 +407,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); diff --git a/include/net/tls.h b/include/net/tls.h index 6fe78361c8c8..b59f0a63292b 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -371,7 +371,7 @@ void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); + int flags, int *addr_len); bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, diff --git a/include/net/udp.h b/include/net/udp.h index f1c2a88c9005..b83a00330566 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -250,14 +250,14 @@ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); -struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *off, int *err); +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, + int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *err) + int *err) { int off = 0; - return __skb_recv_udp(sk, flags, noblock, &off, err); + return __skb_recv_udp(sk, flags, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); -- cgit v1.2.3 From 12dc5c2cb7b269c5a1c6d02844f40bfce942a7a6 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:51 +0300 Subject: net: rtnetlink: add msg kind names Add rtnl kind names instead of using raw values. We'll need to check for DEL kind later to validate bulk flag support. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 9f48733bfd21..78712b51f3da 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,6 +13,13 @@ enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = 1, }; +enum rtnl_kinds { + RTNL_KIND_NEW, + RTNL_KIND_DEL, + RTNL_KIND_GET, + RTNL_KIND_SET +}; + void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_register_module(struct module *owner, int protocol, int msgtype, -- cgit v1.2.3 From 2e9ea3e30f696fd438319c07836422bb0bbb4608 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:52 +0300 Subject: net: rtnetlink: add helper to extract msg type's kind Add a helper which extracts the msg type's kind using the kind mask (0x3). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 78712b51f3da..c51c5ff7f7e2 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -19,6 +19,12 @@ enum rtnl_kinds { RTNL_KIND_GET, RTNL_KIND_SET }; +#define RTNL_KIND_MASK 0x3 + +static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) +{ + return msgtype & RTNL_KIND_MASK; +} void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -- cgit v1.2.3 From 0569e31f1bc2f50613ba4c219f3ecc0d1174d841 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:53 +0300 Subject: net: rtnetlink: use BIT for flag values Use BIT to define flag values. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index c51c5ff7f7e2..0bf622409aaa 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -10,7 +10,7 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { - RTNL_FLAG_DOIT_UNLOCKED = 1, + RTNL_FLAG_DOIT_UNLOCKED = BIT(0), }; enum rtnl_kinds { -- cgit v1.2.3 From a6cec0bcd34264be8887791594be793b3f12719f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:55 +0300 Subject: net: rtnetlink: add bulk delete support flag Add a new rtnl flag (RTNL_FLAG_BULK_DEL_SUPPORTED) which is used to verify that the delete operation allows bulk object deletion. Also emit a warning if anyone tries to set it for non-delete kind. Suggested-by: David Ahern Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 0bf622409aaa..bf8bb3357825 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -10,7 +10,8 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { - RTNL_FLAG_DOIT_UNLOCKED = BIT(0), + RTNL_FLAG_DOIT_UNLOCKED = BIT(0), + RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), }; enum rtnl_kinds { -- cgit v1.2.3 From f3c5264f452a5b0ac1de1f2f657efbabdea3c76a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 12 Apr 2022 18:31:58 +0200 Subject: net: page_pool: introduce ethtool stats Introduce page_pool APIs to report stats through ethtool and reduce duplicated code in each driver. Signed-off-by: Lorenzo Bianconi Reviewed-by: Jakub Kicinski Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ea5fb70e5101..813c93499f20 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -117,6 +117,10 @@ struct page_pool_stats { struct page_pool_recycle_stats recycle_stats; }; +int page_pool_ethtool_stats_get_count(void); +u8 *page_pool_ethtool_stats_get_strings(u8 *data); +u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); + /* * Drivers that wish to harvest page pool stats and report them to users * (perhaps via ethtool, debugfs, or another mechanism) can allocate a @@ -124,6 +128,23 @@ struct page_pool_stats { */ bool page_pool_get_stats(struct page_pool *pool, struct page_pool_stats *stats); +#else + +static inline int page_pool_ethtool_stats_get_count(void) +{ + return 0; +} + +static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + return data; +} + +static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +{ + return data; +} + #endif struct page_pool { -- cgit v1.2.3 From c246f9b5fd617fe487f8b6f18851703f468501d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:25 +0300 Subject: devlink: add support to create line card and expose to user Extend the devlink API so the driver is going to be able to create and destroy linecard instances. There can be multiple line cards per devlink device. Expose this new type of object over devlink netlink API to the userspace, with notifications. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index a30180c0988a..7aabdfb3bc6a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -22,6 +22,7 @@ #include struct devlink; +struct devlink_linecard; struct devlink_port_phys_attrs { u32 port_number; /* Same value as "split group". @@ -1536,6 +1537,9 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, int devlink_rate_leaf_create(struct devlink_port *port, void *priv); void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); void devlink_rate_nodes_destroy(struct devlink *devlink); +struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, + unsigned int linecard_index); +void devlink_linecard_destroy(struct devlink_linecard *linecard); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From fcdc8ce23a309c26a67fc613a741d9b21a248311 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:26 +0300 Subject: devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7aabdfb3bc6a..3e49d4ff498c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -149,6 +149,40 @@ struct devlink_port_new_attrs { sfnum_valid:1; }; +/** + * struct devlink_linecard_ops - Linecard operations + * @provision: callback to provision the linecard slot with certain + * type of linecard. As a result of this operation, + * driver is expected to eventually (could be after + * the function call returns) call one of: + * devlink_linecard_provision_set() + * devlink_linecard_provision_fail() + * @unprovision: callback to unprovision the linecard slot. As a result + * of this operation, driver is expected to eventually + * (could be after the function call returns) call + * devlink_linecard_provision_clear() + * devlink_linecard_provision_fail() + * @same_provision: callback to ask the driver if linecard is already + * provisioned in the same way user asks this linecard to be + * provisioned. + * @types_count: callback to get number of supported types + * @types_get: callback to get next type in list + */ +struct devlink_linecard_ops { + int (*provision)(struct devlink_linecard *linecard, void *priv, + const char *type, const void *type_priv, + struct netlink_ext_ack *extack); + int (*unprovision)(struct devlink_linecard *linecard, void *priv, + struct netlink_ext_ack *extack); + bool (*same_provision)(struct devlink_linecard *linecard, void *priv, + const char *type, const void *type_priv); + unsigned int (*types_count)(struct devlink_linecard *linecard, + void *priv); + void (*types_get)(struct devlink_linecard *linecard, + void *priv, unsigned int index, const char **type, + const void **type_priv); +}; + struct devlink_sb_pool_info { enum devlink_sb_pool_type pool_type; u32 size; @@ -1537,9 +1571,14 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, int devlink_rate_leaf_create(struct devlink_port *port, void *priv); void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); void devlink_rate_nodes_destroy(struct devlink *devlink); -struct devlink_linecard *devlink_linecard_create(struct devlink *devlink, - unsigned int linecard_index); +struct devlink_linecard * +devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type); +void devlink_linecard_provision_clear(struct devlink_linecard *linecard); +void devlink_linecard_provision_fail(struct devlink_linecard *linecard); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From fc9f50d5b366cd9f35bdee22fe3f8d77833cb1d8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:27 +0300 Subject: devlink: implement line card active state Allow driver to mark a line card as active. Expose this state to the userspace over devlink netlink interface with proper notifications. 'active' state means that line card was plugged in after being provisioned. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3e49d4ff498c..d8061a11fee6 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1579,6 +1579,8 @@ void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type); void devlink_linecard_provision_clear(struct devlink_linecard *linecard); void devlink_linecard_provision_fail(struct devlink_linecard *linecard); +void devlink_linecard_activate(struct devlink_linecard *linecard); +void devlink_linecard_deactivate(struct devlink_linecard *linecard); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From b837585985386e05478cb17868d23a6fa87c4f8d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 18 Apr 2022 09:42:28 +0300 Subject: devlink: add port to line card relationship set In order to properly inform user about relationship between port and line card, introduce a driver API to set line card for a port. Use this information to extend port devlink netlink message by line card index and also include the line card index into phys_port_name and by that into a netdevice name. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index d8061a11fee6..2a2a2a0c93f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -136,6 +136,7 @@ struct devlink_port { struct mutex reporters_lock; /* Protects reporter_list */ struct devlink_rate *devlink_rate; + struct devlink_linecard *linecard; }; struct devlink_port_new_attrs { @@ -1571,6 +1572,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, int devlink_rate_leaf_create(struct devlink_port *port, void *priv); void devlink_rate_leaf_destroy(struct devlink_port *devlink_port); void devlink_rate_nodes_destroy(struct devlink *devlink); +void devlink_port_linecard_set(struct devlink_port *devlink_port, + struct devlink_linecard *linecard); struct devlink_linecard * devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv); -- cgit v1.2.3 From 38a6f0865796e26fc38fff4858f681d9ae76fa0f Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:46 +0800 Subject: net: sched: support hash selecting tx queue This patch allows users to pick queue_mapping, range from A to B. Then we can load balance packets from A to B tx queue. The range is an unsigned 16bit value in decimal format. $ tc filter ... action skbedit queue_mapping skbhash A B "skbedit queue_mapping QUEUE_MAPPING" (from "man 8 tc-skbedit") is enhanced with flags: SKBEDIT_F_TXQ_SKBHASH +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | qn | qm v v v HTB/FQ FIFO ... FIFO For example: If P1 sends out packets to different Pods on other host, and we want distribute flows from qn - qm. Then we can use skb->hash as hash. setup commands: $ NETDEV=eth0 $ ip netns add n1 $ ip link add ipv1 link $NETDEV type ipvlan mode l2 $ ip link set ipv1 netns n1 $ ip netns exec n1 ifconfig ipv1 2.2.2.100/24 up $ tc qdisc add dev $NETDEV clsact $ tc filter add dev $NETDEV egress protocol ip prio 1 \ flower skip_hw src_ip 2.2.2.100 action skbedit queue_mapping skbhash 2 6 $ tc qdisc add dev $NETDEV handle 1: root mq $ tc qdisc add dev $NETDEV parent 1:1 handle 2: htb $ tc class add dev $NETDEV parent 2: classid 2:1 htb rate 100kbit $ tc class add dev $NETDEV parent 2: classid 2:2 htb rate 200kbit $ tc qdisc add dev $NETDEV parent 1:2 tbf rate 100mbit burst 100mb latency 1 $ tc qdisc add dev $NETDEV parent 1:3 pfifo $ tc qdisc add dev $NETDEV parent 1:4 pfifo $ tc qdisc add dev $NETDEV parent 1:5 pfifo $ tc qdisc add dev $NETDEV parent 1:6 pfifo $ tc qdisc add dev $NETDEV parent 1:7 pfifo $ ip netns exec n1 iperf3 -c 2.2.2.1 -i 1 -t 10 -P 10 pick txqueue from 2 - 6: $ ethtool -S $NETDEV | grep -i tx_queue_[0-9]_bytes tx_queue_0_bytes: 42 tx_queue_1_bytes: 0 tx_queue_2_bytes: 11442586444 tx_queue_3_bytes: 7383615334 tx_queue_4_bytes: 3981365579 tx_queue_5_bytes: 3983235051 tx_queue_6_bytes: 6706236461 tx_queue_7_bytes: 42 tx_queue_8_bytes: 0 tx_queue_9_bytes: 0 txqueues 2 - 6 are mapped to classid 1:3 - 1:7 $ tc -s class show dev $NETDEV ... class mq 1:3 root leaf 8002: Sent 11949133672 bytes 7929798 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:4 root leaf 8003: Sent 7710449050 bytes 5117279 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:5 root leaf 8004: Sent 4157648675 bytes 2758990 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:6 root leaf 8005: Sent 4159632195 bytes 2759990 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 class mq 1:7 root leaf 8006: Sent 7003169603 bytes 4646912 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 ... Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Signed-off-by: Tonghao Zhang Reviewed-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_skbedit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index cab8229b9bed..dc1079f28e13 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -17,6 +17,7 @@ struct tcf_skbedit_params { u32 mark; u32 mask; u16 queue_mapping; + u16 mapping_mod; u16 ptype; struct rcu_head rcu; }; -- cgit v1.2.3 From 34951fcf26c59e78ae430fba1fce7c08b1871249 Mon Sep 17 00:00:00 2001 From: Boris Sukholitko Date: Tue, 19 Apr 2022 11:14:32 +0300 Subject: flow_dissector: Add number of vlan tags dissector Our customers in the fiber telecom world have network configurations where they would like to control their traffic according to the number of tags appearing in the packet. For example, TR247 GPON conformance test suite specification mostly talks about untagged, single, double tagged packets and gives lax guidelines on the vlan protocol vs. number of vlan tags. This is different from the common IT networks where 802.1Q and 802.1ad protocols are usually describe single and double tagged packet. GPON configurations that we work with have arbitrary mix the above protocols and number of vlan tags in the packet. The goal is to make the following TC commands possible: tc filter add dev eth1 ingress flower \ num_of_vlans 1 vlan_prio 5 action drop From our logs, we have redirect rules such that: tc filter add dev $GPON ingress flower num_of_vlans $N \ action mirred egress redirect dev $DEV where N can range from 0 to 3 and $DEV is the function of $N. Also there are rules setting skb mark based on the number of vlans: tc filter add dev $GPON ingress flower num_of_vlans $N vlan_prio \ $P action skbedit mark $M This new dissector allows extracting the number of vlan tags existing in the packet. Signed-off-by: Boris Sukholitko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 9f65f1bfbd24..a4c6057c7097 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -253,6 +253,14 @@ struct flow_dissector_key_hash { u32 hash; }; +/** + * struct flow_dissector_key_num_of_vlans: + * @num_of_vlans: num_of_vlans value + */ +struct flow_dissector_key_num_of_vlans { + u8 num_of_vlans; +}; + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ @@ -282,6 +290,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */ FLOW_DISSECTOR_KEY_MAX, }; -- cgit v1.2.3 From 67e1e2f4854bb2c0dd2b8440cf090016a0e1a091 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 21 Apr 2022 01:21:33 +0200 Subject: ipv4: Avoid using RTO_ONLINK with ip_route_connect(). Now that ip_rt_fix_tos() doesn't reset ->flowi4_scope unconditionally, we don't have to rely on the RTO_ONLINK bit to properly set the scope of a flowi4 structure. We can just set ->flowi4_scope explicitly and avoid using RTO_ONLINK in ->flowi4_tos. This patch converts callers of ip_route_connect(). Instead of setting the tos parameter with RT_CONN_FLAGS(sk), as all callers do, we can: 1- Drop the tos parameter from ip_route_connect(): its value was entirely based on sk, which is also passed as parameter. 2- Set ->flowi4_scope depending on the SOCK_LOCALROUTE socket option instead of always initialising it with RT_SCOPE_UNIVERSE (let's define ip_sock_rt_scope() for this purpose). 3- Avoid overloading ->flowi4_tos with RTO_ONLINK: since the scope is now properly initialised, we don't need to tell ip_rt_fix_tos() to adjust ->flowi4_scope for us. So let's define ip_sock_rt_tos(), which is the same as RT_CONN_FLAGS() but without the RTO_ONLINK bit overload. Note: In the original ip_route_connect() code, __ip_route_output_key() might clear the RTO_ONLINK bit of fl4->flowi4_tos (because of ip_rt_fix_tos()). Therefore flowi4_update_output() had to reuse the original tos variable. Now that we don't set RTO_ONLINK any more, this is not a problem and we can use fl4->flowi4_tos in flowi4_update_output(). Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 25404fc2b483..991a3985712d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -43,6 +43,19 @@ #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) #define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE)) +static inline __u8 ip_sock_rt_scope(const struct sock *sk) +{ + if (sock_flag(sk, SOCK_LOCALROUTE)) + return RT_SCOPE_LINK; + + return RT_SCOPE_UNIVERSE; +} + +static inline __u8 ip_sock_rt_tos(const struct sock *sk) +{ + return RT_TOS(inet_sk(sk)->tos); +} + struct ip_tunnel_info; struct fib_nh; struct fib_info; @@ -289,39 +302,38 @@ static inline char rt_tos2priority(u8 tos) * ip_route_newports() calls. */ -static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 src, - u32 tos, int oif, u8 protocol, +static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, + __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, - struct sock *sk) + const struct sock *sk) { __u8 flow_flags = 0; if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; - flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport, - sk->sk_uid); + flowi4_init_output(fl4, oif, sk->sk_mark, ip_sock_rt_tos(sk), + ip_sock_rt_scope(sk), protocol, flow_flags, dst, + src, dport, sport, sk->sk_uid); } -static inline struct rtable *ip_route_connect(struct flowi4 *fl4, - __be32 dst, __be32 src, u32 tos, - int oif, u8 protocol, +static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, + __be32 src, int oif, u8 protocol, __be16 sport, __be16 dport, struct sock *sk) { struct net *net = sock_net(sk); struct rtable *rt; - ip_route_connect_init(fl4, dst, src, tos, oif, protocol, - sport, dport, sk); + ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk); if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) return rt; ip_rt_put(rt); - flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); + flowi4_update_output(fl4, oif, fl4->flowi4_tos, fl4->daddr, + fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); -- cgit v1.2.3 From 1e39e5a32ad7fdd82d6e071aa14ecd511eedc1f7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 22 Apr 2022 14:55:39 -0700 Subject: mptcp: infinite mapping sending This patch adds the infinite mapping sending logic. Add a new flag send_infinite_map in struct mptcp_subflow_context. Set it true when a single contiguous subflow is in use and the allow_infinite_fallback flag is true in mptcp_pm_mp_fail_received(). In mptcp_sendmsg_frag(), if this flag is true, call the new function mptcp_update_infinite_map() to set the infinite mapping. Add a new flag infinite_map in struct mptcp_ext, set it true in mptcp_update_infinite_map(), and check this flag in a new helper mptcp_check_infinite_map(). In mptcp_update_infinite_map(), set data_len to 0, and clear the send_infinite_map flag, then do fallback. In mptcp_established_options(), use the helper mptcp_check_infinite_map() to let the infinite mapping DSS can be sent out in the fallback mode. Suggested-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0a3b0fb04a3b..8b1afd6f5cc4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -35,7 +35,8 @@ struct mptcp_ext { frozen:1, reset_transient:1; u8 reset_reason:4, - csum_reqd:1; + csum_reqd:1, + infinite_map:1; }; #define MPTCP_RM_IDS_MAX 8 -- cgit v1.2.3 From 8d92e4fbcf0fb7ecb24223b7b1ce95b9beb4dfa2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Apr 2022 06:44:21 +0300 Subject: devlink: introduce line card devices support Line card can contain one or more devices that makes sense to make visible to the user. For example, this can be a gearbox with flash memory, which could be updated. Provide the driver possibility to attach such devices to a line card and expose those to user. Example: $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G devices: device 0 device 1 device 2 device 3 Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 2a2a2a0c93f7..c84b52fb9ff0 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1578,6 +1578,13 @@ struct devlink_linecard * devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); +struct devlink_linecard_device; +struct devlink_linecard_device * +devlink_linecard_device_create(struct devlink_linecard *linecard, + unsigned int device_index); +void +devlink_linecard_device_destroy(struct devlink_linecard *linecard, + struct devlink_linecard_device *linecard_device); void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type); void devlink_linecard_provision_clear(struct devlink_linecard *linecard); -- cgit v1.2.3 From 276910aecc6a4076f5fbfd8160ff70695d6c1eb5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Apr 2022 06:44:22 +0300 Subject: devlink: introduce line card info get message Allow the driver to provide per line card info get op to fill-up info, similar to the "devlink dev info". Example: $ devlink lc info pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 versions: fixed: hw.revision 0 running: ini.version 4 Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index c84b52fb9ff0..f96dcb376630 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -150,6 +150,8 @@ struct devlink_port_new_attrs { sfnum_valid:1; }; +struct devlink_info_req; + /** * struct devlink_linecard_ops - Linecard operations * @provision: callback to provision the linecard slot with certain @@ -168,6 +170,7 @@ struct devlink_port_new_attrs { * provisioned. * @types_count: callback to get number of supported types * @types_get: callback to get next type in list + * @info_get: callback to get linecard info */ struct devlink_linecard_ops { int (*provision)(struct devlink_linecard *linecard, void *priv, @@ -182,6 +185,9 @@ struct devlink_linecard_ops { void (*types_get)(struct devlink_linecard *linecard, void *priv, unsigned int index, const char **type, const void **type_priv); + int (*info_get)(struct devlink_linecard *linecard, void *priv, + struct devlink_info_req *req, + struct netlink_ext_ack *extack); }; struct devlink_sb_pool_info { @@ -628,7 +634,6 @@ struct devlink_flash_update_params { #define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; -struct devlink_info_req; /** * struct devlink_region_ops - Region operations -- cgit v1.2.3 From 28b2d1f1ac41dda0b239830765d85c7ae4434182 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Apr 2022 06:44:23 +0300 Subject: devlink: introduce line card device info infrastructure Extend the line card info message with information (e.g., FW version) about devices found on the line card. Example: $ devlink lc info pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 versions: fixed: hw.revision 0 running: ini.version 4 devices: device 0 versions: running: fw 19.2010.1310 device 1 versions: running: fw 19.2010.1310 device 2 versions: running: fw 19.2010.1310 device 3 versions: running: fw 19.2010.1310 Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index f96dcb376630..062895973656 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -151,6 +151,7 @@ struct devlink_port_new_attrs { }; struct devlink_info_req; +struct devlink_linecard_device; /** * struct devlink_linecard_ops - Linecard operations @@ -171,6 +172,7 @@ struct devlink_info_req; * @types_count: callback to get number of supported types * @types_get: callback to get next type in list * @info_get: callback to get linecard info + * @device_info_get: callback to get linecard device info */ struct devlink_linecard_ops { int (*provision)(struct devlink_linecard *linecard, void *priv, @@ -188,6 +190,9 @@ struct devlink_linecard_ops { int (*info_get)(struct devlink_linecard *linecard, void *priv, struct devlink_info_req *req, struct netlink_ext_ack *extack); + int (*device_info_get)(struct devlink_linecard_device *device, + void *priv, struct devlink_info_req *req, + struct netlink_ext_ack *extack); }; struct devlink_sb_pool_info { @@ -1583,10 +1588,9 @@ struct devlink_linecard * devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); -struct devlink_linecard_device; struct devlink_linecard_device * devlink_linecard_device_create(struct devlink_linecard *linecard, - unsigned int device_index); + unsigned int device_index, void *priv); void devlink_linecard_device_destroy(struct devlink_linecard *linecard, struct devlink_linecard_device *linecard_device); -- cgit v1.2.3 From 30ca44eb2480ede838c7c8c131a7d25589a98f0a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 Apr 2022 12:08:57 +0200 Subject: net: mac802154: Create an offloaded transmission error helper So far there is only a helper for successful transmissions, which led device drivers to implement their own handling in case of error. Unfortunately, we really need all the drivers to give the hand back to the core once they are done in order to be able to build a proper synchronous API. So let's create a _xmit_error() helper and take this opportunity to fill the new device-global field storing Tx statuses. Suggested-by: Alexander Aring Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220407100903.1695973-5-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/mac802154.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 2c3bbc6645ba..29f289d58720 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -498,4 +498,14 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw); void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, bool ifs_handling); +/** + * ieee802154_xmit_error - offloaded frame transmission failed + * + * @hw: pointer as obtained from ieee802154_alloc_hw(). + * @skb: buffer for transmission + * @reason: error code + */ +void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, + int reason); + #endif /* NET_MAC802154_H */ -- cgit v1.2.3 From 5a1b57c0dde9b93ba8261fa1c81f9aae2bac284a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 Apr 2022 12:08:58 +0200 Subject: net: mac802154: Create an error helper for asynchronous offloading errors A few drivers do the full transmit operation asynchronously, which means that a bus error that happens when forwarding the packet to the transmitter or a timeout happening when offloading the request to the transmitter will not be reported immediately. The solution in this case is to call this new helper to free the necessary resources, restart the queue and always return the same generic TRAC error code: IEEE802154_SYSTEM_ERROR. Suggested-by: Alexander Aring Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220407100903.1695973-6-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/mac802154.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 29f289d58720..bdac0ddbdcdb 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -508,4 +508,13 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, int reason); +/** + * ieee802154_xmit_hw_error - frame could not be offloaded to the transmitter + * because of a hardware error (bus error, timeout, etc) + * + * @hw: pointer as obtained from ieee802154_alloc_hw(). + * @skb: buffer for transmission + */ +void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb); + #endif /* NET_MAC802154_H */ -- cgit v1.2.3 From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Apr 2022 13:12:37 -0700 Subject: net: generalize skb freeing deferral to per-cpu lists Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket lock is released") helped bulk TCP flows to move the cost of skbs frees outside of critical section where socket lock was held. But for RPC traffic, or hosts with RFS enabled, the solution is far from being ideal. For RPC traffic, recvmsg() has to return to user space right after skb payload has been consumed, meaning that BH handler has no chance to pick the skb before recvmsg() thread. This issue is more visible with BIG TCP, as more RPC fit one skb. For RFS, even if BH handler picks the skbs, they are still picked from the cpu on which user thread is running. Ideally, it is better to free the skbs (and associated page frags) on the cpu that originally allocated them. This patch removes the per socket anchor (sk->defer_list) and instead uses a per-cpu list, which will hold more skbs per round. This new per-cpu list is drained at the end of net_action_rx(), after incoming packets have been processed, to lower latencies. In normal conditions, skbs are added to the per-cpu list with no further action. In the (unlikely) cases where the cpu does not run net_action_rx() handler fast enough, we use an IPI to raise NET_RX_SOFTIRQ on the remote cpu. Also, we do not bother draining the per-cpu list from dev_cpu_dead() This is because skbs in this list have no requirement on how fast they should be freed. Note that we can add in the future a small per-cpu cache if we see any contention on sd->defer_lock. Tested on a pair of hosts with 100Gbit NIC, RFS enabled, and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around page recycling strategy used by NIC driver (its page pool capacity being too small compared to number of skbs/pages held in sockets receive queues) Note that this tuning was only done to demonstrate worse conditions for skb freeing for this particular test. These conditions can happen in more general production workload. 10 runs of one TCP_STREAM flow Before: Average throughput: 49685 Mbit. Kernel profiles on cpu running user thread recvmsg() show high cost for skb freeing related functions (*) 57.81% [kernel] [k] copy_user_enhanced_fast_string (*) 12.87% [kernel] [k] skb_release_data (*) 4.25% [kernel] [k] __free_one_page (*) 3.57% [kernel] [k] __list_del_entry_valid 1.85% [kernel] [k] __netif_receive_skb_core 1.60% [kernel] [k] __skb_datagram_iter (*) 1.59% [kernel] [k] free_unref_page_commit (*) 1.16% [kernel] [k] __slab_free 1.16% [kernel] [k] _copy_to_iter (*) 1.01% [kernel] [k] kfree (*) 0.88% [kernel] [k] free_unref_page 0.57% [kernel] [k] ip6_rcv_core 0.55% [kernel] [k] ip6t_do_table 0.54% [kernel] [k] flush_smp_call_function_queue (*) 0.54% [kernel] [k] free_pcppages_bulk 0.51% [kernel] [k] llist_reverse_order 0.38% [kernel] [k] process_backlog (*) 0.38% [kernel] [k] free_pcp_prepare 0.37% [kernel] [k] tcp_recvmsg_locked (*) 0.37% [kernel] [k] __list_add_valid 0.34% [kernel] [k] sock_rfree 0.34% [kernel] [k] _raw_spin_lock_irq (*) 0.33% [kernel] [k] __page_cache_release 0.33% [kernel] [k] tcp_v6_rcv (*) 0.33% [kernel] [k] __put_page (*) 0.29% [kernel] [k] __mod_zone_page_state 0.27% [kernel] [k] _raw_spin_lock After patch: Average throughput: 73076 Mbit. Kernel profiles on cpu running user thread recvmsg() looks better: 81.35% [kernel] [k] copy_user_enhanced_fast_string 1.95% [kernel] [k] _copy_to_iter 1.95% [kernel] [k] __skb_datagram_iter 1.27% [kernel] [k] __netif_receive_skb_core 1.03% [kernel] [k] ip6t_do_table 0.60% [kernel] [k] sock_rfree 0.50% [kernel] [k] tcp_v6_rcv 0.47% [kernel] [k] ip6_rcv_core 0.45% [kernel] [k] read_tsc 0.44% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] _raw_spin_lock 0.37% [kernel] [k] native_irq_return_iret 0.33% [kernel] [k] __inet6_lookup_established 0.31% [kernel] [k] ip6_protocol_deliver_rcu 0.29% [kernel] [k] tcp_rcv_established 0.29% [kernel] [k] llist_reverse_order v2: kdoc issue (kernel bots) do not defer if (alloc_cpu == smp_processor_id()) (Paolo) replace the sk_buff_head with a single-linked list (Jakub) add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 -- include/net/tcp.h | 12 ------------ 2 files changed, 14 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index a01d6c421aa2..f9f8ecae0f8d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -292,7 +292,6 @@ struct sk_filter; * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held - * @defer_list: head of llist storing skbs to be freed * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, @@ -417,7 +416,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - struct llist_head defer_list; #define sk_rmem_alloc sk_backlog.rmem_alloc diff --git a/include/net/tcp.h b/include/net/tcp.h index 679b1964d494..94a52ad1101c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -#ifdef CONFIG_INET -void __sk_defer_free_flush(struct sock *sk); - -static inline void sk_defer_free_flush(struct sock *sk) -{ - if (llist_empty(&sk->defer_list)) - return; - __sk_defer_free_flush(sk); -} -#else -static inline void sk_defer_free_flush(struct sock *sk) {} -#endif int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); -- cgit v1.2.3 From 6fd1d51cfa253b5ee7dae18d7cf1df830e9b6137 Mon Sep 17 00:00:00 2001 From: Erin MacNeil Date: Wed, 27 Apr 2022 16:02:37 -0400 Subject: net: SO_RCVMARK socket option for SO_MARK with recvmsg() Adding a new socket option, SO_RCVMARK, to indicate that SO_MARK should be included in the ancillary data returned by recvmsg(). Renamed the sock_recv_ts_and_drops() function to sock_recv_cmsgs(). Signed-off-by: Erin MacNeil Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Acked-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20220427200259.2564-1-lnx.erin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index f9f8ecae0f8d..663041b92c21 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -893,6 +893,7 @@ enum sock_flags { SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ + SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -2647,20 +2648,21 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) __sock_recv_wifi_status(msg, sk, skb); } -void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, - struct sk_buff *skb); +void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) -static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, - struct sk_buff *skb) +static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) { -#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ - (1UL << SOCK_RCVTSTAMP)) +#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVMARK)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) - __sock_recv_ts_and_drops(msg, sk, skb); + if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY) + __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) -- cgit v1.2.3 From de32bc6aad09131a30b4a9a738e2bf2ba5a9a5aa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:44 +0100 Subject: net: inline sock_alloc_send_skb sock_alloc_send_skb() is simple and just proxying to another function, so we can inline it and cut associated overhead. Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 663041b92c21..73063c88a249 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1823,11 +1823,17 @@ int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, - int noblock, int *errcode); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); + +static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, + unsigned long size, + int noblock, int *errcode) +{ + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); +} + void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); -- cgit v1.2.3 From 34c9a0e71cbb316f360919353273b185c2780cd7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 12 Apr 2022 22:09:59 +0200 Subject: cfg80211: remove cfg80211_get_chan_state() We haven't used this function for years, since commit c781944b71f8 ("cfg80211: Remove unused cfg80211_can_use_iftype_chan()") which itself removed a function unused since commit 97dc94f1d933 ("cfg80211: remove channel_switch combination check"), almost eight years ago. Also remove the now unused enum cfg80211_chan_mode and some struct members that were only used for this function. Link: https://lore.kernel.org/r/20220412220958.1a191dca19d7.Ide4448f02d0e2f1ca2992971421ffc1933a5370a@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 68713388b617..cd1212113901 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5549,8 +5549,6 @@ static inline void wiphy_unlock(struct wiphy *wiphy) * @conn_owner_nlportid: (private) connection owner socket port ID * @disconnect_wk: (private) auto-disconnect work * @disconnect_bssid: (private) the BSSID to use for auto-disconnect - * @ibss_fixed: (private) IBSS is using fixed BSSID - * @ibss_dfs_possible: (private) IBSS may change to a DFS channel * @event_list: (private) list for internal event processing * @event_lock: (private) lock for event list * @owner_nlportid: (private) owner socket port ID @@ -5599,9 +5597,6 @@ struct wireless_dev { struct cfg80211_chan_def preset_chandef; struct cfg80211_chan_def chandef; - bool ibss_fixed; - bool ibss_dfs_possible; - bool ps; int ps_timeout; -- cgit v1.2.3 From 36f8423597000bd7d5e48b7b306e1d0958e72359 Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Wed, 23 Mar 2022 15:46:35 -0700 Subject: cfg80211: support disabling EHT mode Allow userspace to disable EHT mode during association. Signed-off-by: Muna Sinada Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20220323224636.20211-1-quic_alokad@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cd1212113901..6a3e3f0a8615 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2735,6 +2735,7 @@ struct cfg80211_auth_request { * userspace if this flag is set. Only applicable for cfg80211_connect() * request (connect callback). * @ASSOC_REQ_DISABLE_HE: Disable HE + * @ASSOC_REQ_DISABLE_EHT: Disable EHT */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -2742,6 +2743,7 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_USE_RRM = BIT(2), CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), ASSOC_REQ_DISABLE_HE = BIT(4), + ASSOC_REQ_DISABLE_EHT = BIT(5), }; /** -- cgit v1.2.3 From b2d057560b8107c633b39aabe517ff9d93f285e3 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 2 May 2022 10:46:08 +0200 Subject: secure_seq: use the 64 bits of the siphash for port offset calculation SipHash replaced MD5 in secure_ipv{4,6}_port_ephemeral() via commit 7cd23e5300c1 ("secure_seq: use SipHash in place of MD5"), but the output remained truncated to 32-bit only. In order to exploit more bits from the hash, let's make the functions return the full 64-bit of siphash_3u32(). We also make sure the port offset calculation in __inet_hash_connect() remains done on 32-bit to avoid the need for div_u64_rem() and an extra cost on 32-bit systems. Cc: Jason A. Donenfeld Cc: Moshe Kol Cc: Yossi Gilad Cc: Amit Klein Reviewed-by: Eric Dumazet Signed-off-by: Willy Tarreau Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- include/net/secure_seq.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f72ec113ae56..98e1ec1a14f0 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -425,7 +425,7 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) } int __inet_hash_connect(struct inet_timewait_death_row *death_row, - struct sock *sk, u32 port_offset, + struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)); diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index d7d2495f83c2..dac91aa38c5a 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -4,8 +4,8 @@ #include -u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); -u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); u32 secure_tcp_seq(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); -- cgit v1.2.3 From c4a67a21a6d255ddcbaa076c0412aad73c7e0c02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 4 May 2022 08:40:37 -0700 Subject: Revert "Merge branch 'mlxsw-line-card-model'" This reverts commit 5e927a9f4b9f29d78a7c7d66ea717bb5c8bbad8e, reversing changes made to cfc1d91a7d78cf9de25b043d81efcc16966d55b3. The discussion is still ongoing so let's remove the uAPI until the discussion settles. Link: https://lore.kernel.org/all/20220425090021.32e9a98f@kernel.org/ Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20220504154037.539442-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 062895973656..2a2a2a0c93f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -150,9 +150,6 @@ struct devlink_port_new_attrs { sfnum_valid:1; }; -struct devlink_info_req; -struct devlink_linecard_device; - /** * struct devlink_linecard_ops - Linecard operations * @provision: callback to provision the linecard slot with certain @@ -171,8 +168,6 @@ struct devlink_linecard_device; * provisioned. * @types_count: callback to get number of supported types * @types_get: callback to get next type in list - * @info_get: callback to get linecard info - * @device_info_get: callback to get linecard device info */ struct devlink_linecard_ops { int (*provision)(struct devlink_linecard *linecard, void *priv, @@ -187,12 +182,6 @@ struct devlink_linecard_ops { void (*types_get)(struct devlink_linecard *linecard, void *priv, unsigned int index, const char **type, const void **type_priv); - int (*info_get)(struct devlink_linecard *linecard, void *priv, - struct devlink_info_req *req, - struct netlink_ext_ack *extack); - int (*device_info_get)(struct devlink_linecard_device *device, - void *priv, struct devlink_info_req *req, - struct netlink_ext_ack *extack); }; struct devlink_sb_pool_info { @@ -639,6 +628,7 @@ struct devlink_flash_update_params { #define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; +struct devlink_info_req; /** * struct devlink_region_ops - Region operations @@ -1588,12 +1578,6 @@ struct devlink_linecard * devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv); void devlink_linecard_destroy(struct devlink_linecard *linecard); -struct devlink_linecard_device * -devlink_linecard_device_create(struct devlink_linecard *linecard, - unsigned int device_index, void *priv); -void -devlink_linecard_device_destroy(struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device); void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type); void devlink_linecard_provision_clear(struct devlink_linecard *linecard); -- cgit v1.2.3 From ea66758c1795cef81dc59cd5240e9d0f5c5207cf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 May 2022 14:54:06 -0700 Subject: tcp: allow MPTCP to update the announced window The MPTCP RFC requires that the MPTCP-level receive window's right edge never moves backward. Currently the MPTCP code enforces such constraint while tracking the right edge, but it does not reflects it on the wire, as MPTCP lacks a suitable hook to update accordingly the TCP header. This change modifies the existing mptcp_write_options() hook, providing the current packet's TCP header to the MPTCP protocol, so that the next patch could implement the above mentioned constraint. No functional changes intended. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8b1afd6f5cc4..d4ec894ce67b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -125,7 +125,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); -void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts); void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info); -- cgit v1.2.3 From b01a277a0520edd5c815af346f05ef2c747919b6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 May 2022 13:06:38 +0300 Subject: xfrm: free not used XFRM_ESP_NO_TRAILER flag After removal of Innova IPsec support from mlx5 driver, the last user of this XFRM_ESP_NO_TRAILER was gone too. This means that we can safely remove it as no other hardware is capable (or need) to remove ESP trailer. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6fb899ff5afc..b41278abeeaa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1006,7 +1006,7 @@ struct xfrm_offload { #define CRYPTO_FALLBACK 8 #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 -#define XFRM_ESP_NO_TRAILER 64 +/* 64 is free */ #define XFRM_DEV_RESUME 128 #define XFRM_XMIT 256 -- cgit v1.2.3 From a36708e646586f74d073199828ed878b223e988d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 May 2022 13:06:39 +0300 Subject: xfrm: delete not used number of external headers num_exthdrs is set but never used, so delete it. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b41278abeeaa..4e097423116c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -131,7 +131,6 @@ struct xfrm_state_offload { netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; - unsigned int num_exthdrs; u8 flags; }; -- cgit v1.2.3 From 87e0a94e60ea2e29be9dec6bc146fbc9861a4055 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 May 2022 13:06:40 +0300 Subject: xfrm: rename xfrm_state_offload struct to allow reuse The struct xfrm_state_offload has all fields needed to hold information for offloaded policies too. In order to do not create new struct with same fields, let's rename existing one and reuse it later. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Acked-by: David S. Miller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4e097423116c..bb20278d689c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -126,7 +126,7 @@ struct xfrm_state_walk { struct xfrm_address_filter *filter; }; -struct xfrm_state_offload { +struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; @@ -246,7 +246,7 @@ struct xfrm_state { struct xfrm_lifetime_cur curlft; struct hrtimer mtimer; - struct xfrm_state_offload xso; + struct xfrm_dev_offload xso; /* used to fix curlft->add_time when changing date */ long saved_tmo; @@ -1865,7 +1865,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { - struct xfrm_state_offload *xso = &x->xso; + struct xfrm_dev_offload *xso = &x->xso; if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); @@ -1891,7 +1891,7 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) static inline void xfrm_dev_state_delete(struct xfrm_state *x) { - struct xfrm_state_offload *xso = &x->xso; + struct xfrm_dev_offload *xso = &x->xso; if (xso->dev) xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); @@ -1899,7 +1899,7 @@ static inline void xfrm_dev_state_delete(struct xfrm_state *x) static inline void xfrm_dev_state_free(struct xfrm_state *x) { - struct xfrm_state_offload *xso = &x->xso; + struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = xso->dev; if (dev && dev->xfrmdev_ops) { -- cgit v1.2.3 From 482db2f1dd211f73ad9d71e33ae15c1df6379982 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 May 2022 13:06:41 +0300 Subject: xfrm: store and rely on direction to construct offload flags XFRM state doesn't need anything from flags except to understand direction, so store it separately. For future patches, such change will allow us to reuse xfrm_dev_offload for policy offload too, which has three possible directions instead of two. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index bb20278d689c..45422f7be0c5 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -126,12 +126,18 @@ struct xfrm_state_walk { struct xfrm_address_filter *filter; }; +enum { + XFRM_DEV_OFFLOAD_IN = 1, + XFRM_DEV_OFFLOAD_OUT, +}; + struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; u8 flags; + u8 dir : 2; }; struct xfrm_mode { -- cgit v1.2.3 From 254c4a824c7c6a53360bc4974710e4213b8b7f5d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 May 2022 13:06:45 +0300 Subject: xfrm: drop not needed flags variable in XFRM offload struct After drivers were converted to rely on direction, the flags is not used anymore and can be removed. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 45422f7be0c5..736c349de8bf 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -136,7 +136,6 @@ struct xfrm_dev_offload { netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; - u8 flags; u8 dir : 2; }; -- cgit v1.2.3 From fe5233b0ba0d2216d549f93e4540542b99b97642 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 May 2022 19:22:13 +0300 Subject: net: dsa: delete dsa_port_walk_{fdbs,mdbs} All the users of these functions are gone, delete them before they gain new ones. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 934958fda962..efd33956df37 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1239,12 +1239,6 @@ struct dsa_switch_driver { struct net_device *dsa_dev_to_net_device(struct device *dev); -typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid, - struct dsa_db db); - -int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); -int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); -- cgit v1.2.3 From 5b87be9e4978fe507c7e14c0bdff147d10d39aec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:37 -0700 Subject: net: add include/net/net_debug.h Remove from include/linux/netdevice.h helpers that send debug/info/warnings to syslog. We plan adding more helpers in following patches. v2: added two includes, and 'struct net_device' forward declaration to avoid compile errors (kernel bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_debug.h | 151 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 include/net/net_debug.h (limited to 'include/net') diff --git a/include/net/net_debug.h b/include/net/net_debug.h new file mode 100644 index 000000000000..19cd2700c20e --- /dev/null +++ b/include/net/net_debug.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NET_DEBUG_H +#define _LINUX_NET_DEBUG_H + +#include +#include + +struct net_device; + +__printf(3, 4) __cold +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); +__printf(2, 3) __cold +void netdev_emerg(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_alert(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_crit(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_err(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_warn(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_notice(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_info(const struct net_device *dev, const char *format, ...); + +#define netdev_level_once(level, dev, fmt, ...) \ +do { \ + static bool __section(".data.once") __print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define netdev_emerg_once(dev, fmt, ...) \ + netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) +#define netdev_alert_once(dev, fmt, ...) \ + netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) +#define netdev_crit_once(dev, fmt, ...) \ + netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) +#define netdev_err_once(dev, fmt, ...) \ + netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) +#define netdev_warn_once(dev, fmt, ...) \ + netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) +#define netdev_notice_once(dev, fmt, ...) \ + netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) +#define netdev_info_once(dev, fmt, ...) \ + netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) + +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#define netdev_dbg(__dev, format, args...) \ +do { \ + dynamic_netdev_dbg(__dev, format, ##args); \ +} while (0) +#elif defined(DEBUG) +#define netdev_dbg(__dev, format, args...) \ + netdev_printk(KERN_DEBUG, __dev, format, ##args) +#else +#define netdev_dbg(__dev, format, args...) \ +({ \ + if (0) \ + netdev_printk(KERN_DEBUG, __dev, format, ##args); \ +}) +#endif + +#if defined(VERBOSE_DEBUG) +#define netdev_vdbg netdev_dbg +#else + +#define netdev_vdbg(dev, format, args...) \ +({ \ + if (0) \ + netdev_printk(KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + +/* netif printk helpers, similar to netdev_printk */ + +#define netif_printk(priv, type, level, dev, fmt, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + netdev_printk(level, (dev), fmt, ##args); \ +} while (0) + +#define netif_level(level, priv, type, dev, fmt, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + netdev_##level(dev, fmt, ##args); \ +} while (0) + +#define netif_emerg(priv, type, dev, fmt, args...) \ + netif_level(emerg, priv, type, dev, fmt, ##args) +#define netif_alert(priv, type, dev, fmt, args...) \ + netif_level(alert, priv, type, dev, fmt, ##args) +#define netif_crit(priv, type, dev, fmt, args...) \ + netif_level(crit, priv, type, dev, fmt, ##args) +#define netif_err(priv, type, dev, fmt, args...) \ + netif_level(err, priv, type, dev, fmt, ##args) +#define netif_warn(priv, type, dev, fmt, args...) \ + netif_level(warn, priv, type, dev, fmt, ##args) +#define netif_notice(priv, type, dev, fmt, args...) \ + netif_level(notice, priv, type, dev, fmt, ##args) +#define netif_info(priv, type, dev, fmt, args...) \ + netif_level(info, priv, type, dev, fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#define netif_dbg(priv, type, netdev, format, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + dynamic_netdev_dbg(netdev, format, ##args); \ +} while (0) +#elif defined(DEBUG) +#define netif_dbg(priv, type, dev, format, args...) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) +#else +#define netif_dbg(priv, type, dev, format, args...) \ +({ \ + if (0) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + +/* if @cond then downgrade to debug, else print at @level */ +#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ + do { \ + if (cond) \ + netif_dbg(priv, type, netdev, fmt, ##args); \ + else \ + netif_ ## level(priv, type, netdev, fmt, ##args); \ + } while (0) + +#if defined(VERBOSE_DEBUG) +#define netif_vdbg netif_dbg +#else +#define netif_vdbg(priv, type, dev, format, args...) \ +({ \ + if (0) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + + +#endif /* _LINUX_NET_DEBUG_H */ -- cgit v1.2.3 From d268c1f5cfc92eb5bb605f7365769aacd93be234 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:38 -0700 Subject: net: add CONFIG_DEBUG_NET This config option enables network debugging checks. This patch adds DEBUG_NET_WARN_ON_ONCE(cond) Note that this is not a replacement for WARN_ON_ONCE(cond) as (cond) is not evaluated if CONFIG_DEBUG_NET is not set. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_debug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/net_debug.h b/include/net/net_debug.h index 19cd2700c20e..1e74684cbbdb 100644 --- a/include/net/net_debug.h +++ b/include/net/net_debug.h @@ -148,4 +148,10 @@ do { \ #endif +#if defined(CONFIG_DEBUG_NET) +#define DEBUG_NET_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#else +#define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#endif + #endif /* _LINUX_NET_DEBUG_H */ -- cgit v1.2.3 From 8b796475fd7882663a870456466a4fb315cc1bd6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 10 May 2022 16:57:34 +0200 Subject: net/sched: act_pedit: really ensure the skb is writable Currently pedit tries to ensure that the accessed skb offset is writable via skb_unclone(). The action potentially allows touching any skb bytes, so it may end-up modifying shared data. The above causes some sporadic MPTCP self-test failures, due to this code: tc -n $ns2 filter add dev ns2eth$i egress \ protocol ip prio 1000 \ handle 42 fw \ action pedit munge offset 148 u8 invert \ pipe csum tcp \ index 100 The above modifies a data byte outside the skb head and the skb is a cloned one, carrying a TCP output packet. This change addresses the issue by keeping track of a rough over-estimate highest skb offset accessed by the action and ensuring such offset is really writable. Note that this may cause performance regressions in some scenarios, but hopefully pedit is not in the critical path. Fixes: db2c24175d14 ("act_pedit: access skb->data safely") Acked-by: Mat Martineau Tested-by: Geliang Tang Signed-off-by: Paolo Abeni Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/1fcf78e6679d0a287dd61bb0f04730ce33b3255d.1652194627.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_pedit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 748cf87a4d7e..3e02709a1df6 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -14,6 +14,7 @@ struct tcf_pedit { struct tc_action common; unsigned char tcfp_nkeys; unsigned char tcfp_flags; + u32 tcfp_off_max_hint; struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; }; -- cgit v1.2.3 From 103a2f3255a95991252f8f13375c3a96a75011cd Mon Sep 17 00:00:00 2001 From: Itay Iellin Date: Sat, 7 May 2022 08:32:48 -0400 Subject: Bluetooth: Fix the creation of hdev->name Set a size limit of 8 bytes of the written buffer to "hdev->name" including the terminating null byte, as the size of "hdev->name" is 8 bytes. If an id value which is greater than 9999 is allocated, then the "snprintf(hdev->name, sizeof(hdev->name), "hci%d", id)" function call would lead to a truncation of the id value in decimal notation. Set an explicit maximum id parameter in the id allocation function call. The id allocation function defines the maximum allocated id value as the maximum id parameter value minus one. Therefore, HCI_MAX_ID is defined as 10000. Signed-off-by: Itay Iellin Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8abd08245326..62d7b81b1cb7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -36,6 +36,9 @@ /* HCI priority */ #define HCI_PRIO_MAX 7 +/* HCI maximum id value */ +#define HCI_MAX_ID 10000 + /* HCI Core structures */ struct inquiry_data { bdaddr_t bdaddr; -- cgit v1.2.3 From 465c3de42b5dcdf0fa8cf996a81de6ba0e8553a3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 11 May 2022 12:50:16 +0300 Subject: net: dsa: introduce the dsa_cpu_ports() helper Similar to dsa_user_ports() which retrieves a port mask of all user ports, introduce dsa_cpu_ports() which retrieves the mask of all CPU ports of a switch. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index efd33956df37..76257a9f0e1b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -590,6 +590,17 @@ static inline u32 dsa_user_ports(struct dsa_switch *ds) return mask; } +static inline u32 dsa_cpu_ports(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp; + u32 mask = 0; + + dsa_switch_for_each_cpu_port(cpu_dp, ds) + mask |= BIT(cpu_dp->index); + + return mask; +} + /* Return the local port used to reach an arbitrary switch device */ static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) { -- cgit v1.2.3 From 72c3b0c7359a6f91dd03e08b839adccd9d4268a8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 11 May 2022 12:50:17 +0300 Subject: net: dsa: felix: manage host flooding using a specific driver callback At the time - commit 7569459a52c9 ("net: dsa: manage flooding on the CPU ports") - not introducing a dedicated switch callback for host flooding made sense, because for the only user, the felix driver, there was nothing different to do for the CPU port than set the flood flags on the CPU port just like on any other bridge port. There are 2 reasons why this approach is not good enough, however. (1) Other drivers, like sja1105, support configuring flooding as a function of {ingress port, egress port}, whereas the DSA ->port_bridge_flags() function only operates on an egress port. So with that driver we'd have useless host flooding from user ports which don't need it. (2) Even with the felix driver, support for multiple CPU ports makes it difficult to piggyback on ->port_bridge_flags(). The way in which the felix driver is going to support host-filtered addresses with multiple CPU ports is that it will direct these addresses towards both CPU ports (in a sort of multicast fashion), then restrict the forwarding to only one of the two using the forwarding masks. Consequently, flooding will also be enabled towards both CPU ports. However, ->port_bridge_flags() gets passed the index of a single CPU port, and that leaves the flood settings out of sync between the 2 CPU ports. This is to say, it's better to have a specific driver method for host flooding, which takes the user port as argument. This solves problem (1) by allowing the driver to do different things for different user ports, and problem (2) by abstracting the operation and letting the driver do whatever, rather than explicitly making the DSA core point to the CPU port it thinks needs to be touched. This new method also creates a problem, which is that cross-chip setups are not handled. However I don't have hardware right now where I can test what is the proper thing to do, and there isn't hardware compatible with multi-switch trees that supports host flooding. So it remains a problem to be tackled in the future. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 76257a9f0e1b..cfb287b0d311 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -978,6 +978,8 @@ struct dsa_switch_ops { int (*port_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); + void (*port_set_host_flood)(struct dsa_switch *ds, int port, + bool uc, bool mc); /* * VLAN support -- cgit v1.2.3 From bacf93b0561937695e9c6c1dc1d8ed10ca80eb81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 11 May 2022 12:50:18 +0300 Subject: net: dsa: remove port argument from ->change_tag_protocol() DSA has not supported (and probably will not support in the future either) independent tagging protocols per CPU port. Different switch drivers have different requirements, some may need to replicate some settings for each CPU port, some may need to apply some settings on a single CPU port, while some may have to configure some global settings and then some per-CPU-port settings. In any case, the current model where DSA calls ->change_tag_protocol for each CPU port turns out to be impractical for drivers where there are global things to be done. For example, felix calls dsa_tag_8021q_register(), which makes no sense per CPU port, so it suppresses the second call. Let drivers deal with replication towards all CPU ports, and remove the CPU port argument from the function prototype. Signed-off-by: Vladimir Oltean Acked-by: Luiz Angelo Daros de Luca Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index cfb287b0d311..14f07275852b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -579,6 +579,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) +#define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \ + dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ + if (dsa_port_is_cpu((_dp))) + static inline u32 dsa_user_ports(struct dsa_switch *ds) { struct dsa_port *dp; @@ -803,7 +807,7 @@ struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); - int (*change_tag_protocol)(struct dsa_switch *ds, int port, + int (*change_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); /* * Method for switch drivers to connect to the tagging protocol driver -- cgit v1.2.3 From 8ea1eebb49a2dfee1dce621a638cc1626e542392 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 May 2022 17:05:52 -0700 Subject: net: inet: Remove count from inet_listen_hashbucket After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address") and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"), the count is no longer used. This patch removes it. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 98e1ec1a14f0..103fc7a30e60 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -111,7 +111,6 @@ struct inet_bind_hashbucket { #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - unsigned int count; union { struct hlist_head head; struct hlist_nulls_head nulls_head; -- cgit v1.2.3 From cae3873c5b3a4fcd9706fb461ff4e91bdf1f0120 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 May 2022 17:06:05 -0700 Subject: net: inet: Retire port only listening_hash The listen sk is currently stored in two hash tables, listening_hash (hashed by port) and lhash2 (hashed by port and address). After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address") and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"), the TCP-SYN lookup fast path does not use listening_hash. The commit 05c0b35709c5 ("tcp: seq_file: Replace listening_hash with lhash2") also moved the seq_file (/proc/net/tcp) iteration usage from listening_hash to lhash2. There are still a few listening_hash usages left. One of them is inet_reuseport_add_sock() which uses the listening_hash to search a listen sk during the listen() system call. This turns out to be very slow on use cases that listen on many different VIPs at a popular port (e.g. 443). [ On top of the slowness in adding to the tail in the IPv6 case ]. The latter patch has a selftest to demonstrate this case. This patch takes this chance to move all remaining listening_hash usages to lhash2 and then retire listening_hash. Since most changes need to be done together, it is hard to cut the listening_hash to lhash2 switch into small patches. The changes in this patch is highlighted here for the review purpose. 1. Because of the listening_hash removal, lhash2 can use the sk->sk_nulls_node instead of the icsk->icsk_listen_portaddr_node. This will also keep the sk_unhashed() check to work as is after stop adding sk to listening_hash. The union is removed from inet_listen_hashbucket because only nulls_head is needed. 2. icsk->icsk_listen_portaddr_node and its helpers are removed. 3. The current lhash2 users needs to iterate with sk_nulls_node instead of icsk_listen_portaddr_node. One case is in the inet[6]_lhash2_lookup(). Another case is the seq_file iterator in tcp_ipv4.c. One thing to note is sk_nulls_next() is needed because the old inet_lhash2_for_each_icsk_continue() does a "next" first before iterating. 4. Move the remaining listening_hash usage to lhash2 inet_reuseport_add_sock() which this series is trying to improve. inet_diag.c and mptcp_diag.c are the final two remaining use cases and is moved to lhash2 now also. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 2 -- include/net/inet_hashtables.h | 41 +------------------------------------- 2 files changed, 1 insertion(+), 42 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3908296d103f..85cd695e7fd1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -66,7 +66,6 @@ struct inet_connection_sock_af_ops { * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data * @icsk_clean_acked Clean acked data hook - * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -96,7 +95,6 @@ struct inet_connection_sock { const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); - struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 103fc7a30e60..1b8706719d4f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -111,10 +111,7 @@ struct inet_bind_hashbucket { #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - union { - struct hlist_head head; - struct hlist_nulls_head nulls_head; - }; + struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ @@ -142,32 +139,8 @@ struct inet_hashinfo { /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; - - /* All the above members are written once at bootup and - * never written again _or_ are predominantly read-access. - * - * Now align to a new cache line as all the following members - * might be often dirty. - */ - /* All sockets in TCP_LISTEN state will be in listening_hash. - * This is the only table where wildcard'd TCP sockets can - * exist. listening_hash is only hashed by local port number. - * If lhash2 is initialized, the same socket will also be hashed - * to lhash2 by port and address. - */ - struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] - ____cacheline_aligned_in_smp; }; -#define inet_lhash2_for_each_icsk_continue(__icsk) \ - hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node) - -#define inet_lhash2_for_each_icsk(__icsk, list) \ - hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node) - -#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ - hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) - static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { @@ -229,23 +202,11 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum); -/* These can have wildcards, don't try too hard. */ -static inline u32 inet_lhashfn(const struct net *net, const unsigned short num) -{ - return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1); -} - -static inline int inet_sk_listen_hashfn(const struct sock *sk) -{ - return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num); -} - /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); -void inet_hashinfo_init(struct inet_hashinfo *h); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, -- cgit v1.2.3 From 05abad857277dda198063017b00ba5b9fed2c0cb Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 1 Apr 2022 16:38:23 -0700 Subject: Bluetooth: HCI: Add HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN quirk This adds HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN quirk which can be used to mark HCI_Enhanced_Setup_Synchronous_Connection as broken even if its support command bit are set since some controller report it as supported but the command don't work properly with some configurations (e.g. BT_VOICE_TRANSPARENT/mSBC). Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 9 +++++++++ include/net/bluetooth/hci_core.h | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 69ef31cea582..62a9bb022aed 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -265,6 +265,15 @@ enum { * runtime suspend, because event filtering takes place there. */ HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, + + /* + * When this quirk is set, disables the use of + * HCI_OP_ENHANCED_SETUP_SYNC_CONN command to setup SCO connections. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 62d7b81b1cb7..5a52a2018b56 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1495,8 +1495,12 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define privacy_mode_capable(dev) (use_ll_privacy(dev) && \ (hdev->commands[39] & 0x04)) -/* Use enhanced synchronous connection if command is supported */ -#define enhanced_sco_capable(dev) ((dev)->commands[29] & 0x08) +/* Use enhanced synchronous connection if command is supported and its quirk + * has not been set. + */ +#define enhanced_sync_conn_capable(dev) \ + (((dev)->commands[29] & 0x08) && \ + !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ -- cgit v1.2.3 From 4915d50e300e96929d2462041d6f6c6f061167fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 May 2022 09:56:01 -0700 Subject: inet: add READ_ONCE(sk->sk_bound_dev_if) in INET_MATCH() INET_MATCH() runs without holding a lock on the socket. We probably need to annotate most reads. This patch makes INET_MATCH() an inline function to ease our changes. v2: We remove the 32bit version of it, as modern compilers should generate the same code really, no need to try to be smarter. Also make 'struct net *net' the first argument. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 35 ++++++++++++++++------------------- include/net/sock.h | 3 --- 2 files changed, 16 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 1b8706719d4f..59d72024ad1d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -255,7 +255,6 @@ static inline struct sock *inet_lookup_listener(struct net *net, ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif -#if (BITS_PER_LONG == 64) #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ @@ -267,24 +266,22 @@ static inline struct sock *inet_lookup_listener(struct net *net, (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ -#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ - (((__sk)->sk_portpair == (__ports)) && \ - ((__sk)->sk_addrpair == (__cookie)) && \ - (((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ - net_eq(sock_net(__sk), (__net))) -#else /* 32-bit arch */ -#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ - const int __name __deprecated __attribute__((unused)) - -#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ - (((__sk)->sk_portpair == (__ports)) && \ - ((__sk)->sk_daddr == (__saddr)) && \ - ((__sk)->sk_rcv_saddr == (__daddr)) && \ - (((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ - net_eq(sock_net(__sk), (__net))) -#endif /* 64-bit arch */ + +static inline bool INET_MATCH(struct net *net, const struct sock *sk, + const __addrpair cookie, const __portpair ports, + int dif, int sdif) +{ + int bound_dev_if; + + if (!net_eq(sock_net(sk), net) || + sk->sk_portpair != ports || + sk->sk_addrpair != cookie) + return false; + + /* Paired with WRITE_ONCE() from sock_bindtoindex_locked() */ + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + return bound_dev_if == dif || bound_dev_if == sdif; +} /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM diff --git a/include/net/sock.h b/include/net/sock.h index 73063c88a249..01edfde4257d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -161,9 +161,6 @@ typedef __u64 __bitwise __addrpair; * for struct sock and struct inet_timewait_sock. */ struct sock_common { - /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned - * address on 64bit arches : cf INET_MATCH() - */ union { __addrpair skc_addrpair; struct { -- cgit v1.2.3 From 04c494e68a1340cb5c70d4704ac32d863dc64293 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 May 2022 14:14:56 -0700 Subject: Revert "tcp/dccp: get rid of inet_twsk_purge()" This reverts commits: 0dad4087a86a2cbe177404dc73f18ada26a2c390 ("tcp/dccp: get rid of inet_twsk_purge()") d507204d3c5cc57d9a8bdf0a477615bb59ea1611 ("tcp/dccp: add tw->tw_bslot") As Leonard pointed out, a newly allocated netns can happen to reuse a freed 'struct net'. While TCP TW timers were covered by my patches, other things were not: 1) Lookups in rx path (INET_MATCH() and INET6_MATCH()), as they look at 4-tuple plus the 'struct net' pointer. 2) /proc/net/tcp[6] and inet_diag, same reason. 3) hashinfo->bhash[], same reason. Fixing all this seems risky, lets instead revert. In the future, we might have a per netns tcp hash table, or a per netns list of timewait sockets... Fixes: 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()") Signed-off-by: Eric Dumazet Reported-by: Leonard Crestez Tested-by: Leonard Crestez Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 463ae5d33eb0..5b47545f22d3 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -71,7 +71,6 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; - u32 tw_bslot; /* bind bucket slot */ struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; @@ -110,6 +109,8 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); + static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { -- cgit v1.2.3 From 2ed3bf188b33630cf9d93b996ebf001847a00b5a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:16 +0200 Subject: netfilter: ecache: use dedicated list for event redelivery This disentangles event redelivery and the percpu dying list. Because entries are now stored on a dedicated list, all entries are in NFCT_ECACHE_DESTROY_FAIL state and all entries still have confirmed bit set -- the reference count is at least 1. The 'struct net' back-pointer can be removed as well. The pcpu dying list will be removed eventually, it has no functionality. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 ++- include/net/netfilter/nf_conntrack_ecache.h | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 69e6c6a218be..28672a944499 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -45,7 +45,8 @@ union nf_conntrack_expect_proto { struct nf_conntrack_net_ecache { struct delayed_work dwork; - struct netns_ct *ct_net; + spinlock_t dying_lock; + struct hlist_nulls_head dying_list; }; struct nf_conntrack_net { diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 6c4c490a3e34..a6135b5030dd 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -14,7 +14,6 @@ #include enum nf_ct_ecache_state { - NFCT_ECACHE_UNKNOWN, /* destroy event not sent */ NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */ }; @@ -23,7 +22,6 @@ struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ - enum nf_ct_ecache_state state:8;/* ecache state */ u32 missed; /* missed events */ u32 portid; /* netlink portid of destroyer */ }; -- cgit v1.2.3 From 0d3cc504ba9cdcff76346306c37eb1ea01e60a86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:17 +0200 Subject: netfilter: conntrack: include ecache dying list in dumps The new pernet dying list includes conntrack entries that await delivery of the 'destroy' event via ctnetlink. The old percpu dying list will be removed soon. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index a6135b5030dd..b57d73785e4d 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -164,6 +164,8 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state); void nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); +struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net); + static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return net->ct.ecache_dwork_pending; -- cgit v1.2.3 From 1397af5bfd7d32b0cf2adb70a78c9a9e8f11d912 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:18 +0200 Subject: netfilter: conntrack: remove the percpu dying list Its no longer needed. Entries that need event redelivery are placed on the new pernet dying list. The advantage is that there is no need to take additional spinlock on conntrack removal unless event redelivery failed or the conntrack entry was never added to the table in the first place (confirmed bit not set). The IPS_CONFIRMED bit now needs to be set as soon as the entry has been unlinked from the unconfirmed list, else the destroy function may attempt to unlink it a second time. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 0294f3d473af..e985a3010b89 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -96,7 +96,6 @@ struct nf_ip_net { struct ct_pcpu { spinlock_t lock; struct hlist_nulls_head unconfirmed; - struct hlist_nulls_head dying; }; struct netns_ct { -- cgit v1.2.3 From 78222bacfca97cb18505df1ba5f3591864498a7e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:19 +0200 Subject: netfilter: cttimeout: decouple unlink and free on netns destruction Make it so netns pre_exit unlinks the objects from the pernet list, so they cannot be found anymore. netns core issues a synchronize_rcu() before calling the exit hooks so any the time the exit hooks run unconfirmed nf_conn entries have been free'd or they have been committed to the hashtable. The exit hook still tags unconfirmed entries as dying, this can now be removed in a followup change. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 3ea94f6f3844..fea258983d23 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -17,14 +17,6 @@ struct nf_ct_timeout { char data[]; }; -struct ctnl_timeout { - struct list_head head; - struct rcu_head rcu_head; - refcount_t refcnt; - char name[CTNL_TIMEOUT_NAME_MAX]; - struct nf_ct_timeout timeout; -}; - struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; -- cgit v1.2.3 From 17438b42ce14cb60ceda9ae62ad5dd022d55a216 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:20 +0200 Subject: netfilter: remove nf_ct_unconfirmed_destroy helper This helper tags connections not yet in the conntrack table as dying. These nf_conn entries will be dropped instead when the core attempts to insert them from the input or postrouting 'confirm' hook. After the previous change, the entries get unlinked from the list earlier, so that by the time the actual exit hook runs, new connections no longer have a timeout policy assigned. Its enough to walk the hashtable instead. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 28672a944499..f60212244b13 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -237,9 +237,6 @@ static inline bool nf_ct_kill(struct nf_conn *ct) return nf_ct_delete(ct, 0, 0); } -/* Set all unconfirmed conntrack as dying */ -void nf_ct_unconfirmed_destroy(struct net *); - /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), -- cgit v1.2.3 From c56716c69ce1ac320432fb1ea5654196ba24d2f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:21 +0200 Subject: netfilter: extensions: introduce extension genid count Multiple netfilter extensions store pointers to external data in their extension area struct. Examples: 1. Timeout policies 2. Connection tracking helpers. No references are taken for these. When a helper or timeout policy is removed, the conntrack table gets traversed and affected extensions are cleared. Conntrack entries not yet in the hashtable are referenced via a special list, the unconfirmed list. On removal of a policy or connection tracking helper, the unconfirmed list gets traversed an all entries are marked as dying, this prevents them from getting committed to the table at insertion time: core checks for dying bit, if set, the conntrack entry gets destroyed at confirm time. The disadvantage is that each new conntrack has to be added to the percpu unconfirmed list, and each insertion needs to remove it from this list. The list is only ever needed when a policy or helper is removed -- a rare occurrence. Add a generation ID count: Instead of adding to the list and then traversing that list on policy/helper removal, increment a counter that is stored in the extension area. For unconfirmed conntracks, the extension has the genid valid at ct allocation time. Removal of a helper/policy etc. increments the counter. At confirmation time, validate that ext->genid == global_id. If the stored number is not the same, do not allow the conntrack insertion, just like as if a confirmed-list traversal would have flagged the entry as dying. After insertion, the genid is no longer relevant (conntrack entries are now reachable via the conntrack table iterators and is set to 0. This allows removal of the percpu unconfirmed list. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 31 +++++++++++++++-------------- include/net/netfilter/nf_conntrack_labels.h | 10 +++++++++- 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 96635ad2acc7..0b247248b032 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -34,21 +34,11 @@ enum nf_ct_ext_id { NF_CT_EXT_NUM, }; -#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help -#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat -#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj -#define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct -#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache -#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp -#define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout -#define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels -#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy -#define NF_CT_EXT_ACT_CT_TYPE struct nf_conn_act_ct_ext - /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; + unsigned int gen_id; char data[] __aligned(8); }; @@ -62,17 +52,28 @@ static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id) return (ct->ext && __nf_ct_ext_exist(ct->ext, id)); } -static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) +void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id); + +static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id) { - if (!nf_ct_ext_exist(ct, id)) + struct nf_ct_ext *ext = ct->ext; + + if (!ext || !__nf_ct_ext_exist(ext, id)) return NULL; + if (unlikely(ext->gen_id)) + return __nf_ct_ext_find(ext, id); + return (void *)ct->ext + ct->ext->offset[id]; } -#define nf_ct_ext_find(ext, id) \ - ((id##_TYPE *)__nf_ct_ext_find((ext), (id))) /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); +/* ext genid. if ext->id != ext_genid, extensions cannot be used + * anymore unless conntrack has CONFIRMED bit set. + */ +extern atomic_t nf_conntrack_ext_genid; +void nf_ct_ext_bump_genid(void); + #endif /* _NF_CONNTRACK_EXTEND_H */ diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index 3c23298e68ca..66bab6c60d12 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -17,10 +17,18 @@ struct nf_conn_labels { unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)]; }; +/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols + * exported by nf_conntrack module. + */ static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_LABELS - return nf_ct_ext_find(ct, NF_CT_EXT_LABELS); + struct nf_ct_ext *ext = ct->ext; + + if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS)) + return NULL; + + return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS]; #else return NULL; #endif -- cgit v1.2.3 From 8a75a2c1741073f0c2d7bee146648e717527e048 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Apr 2022 13:01:24 +0200 Subject: netfilter: conntrack: remove unconfirmed list It has no function anymore and can be removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 - include/net/netns/conntrack.h | 6 ------ 2 files changed, 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f60212244b13..3ce9a5b42fe5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -101,7 +101,6 @@ struct nf_conn { /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; - u16 cpu; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index e985a3010b89..a71cfd4e2f21 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -93,11 +93,6 @@ struct nf_ip_net { #endif }; -struct ct_pcpu { - spinlock_t lock; - struct hlist_nulls_head unconfirmed; -}; - struct netns_ct { #ifdef CONFIG_NF_CONNTRACK_EVENTS bool ecache_dwork_pending; @@ -109,7 +104,6 @@ struct netns_ct { u8 sysctl_tstamp; u8 sysctl_checksum; - struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; struct nf_ip_net nf_ct_proto; -- cgit v1.2.3 From 8169ff584003c871a226719e998bb034231954d6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 8 Apr 2022 13:10:19 +0200 Subject: netfilter: conntrack: add nf_ct_iter_data object for nf_ct_iterate_cleanup*() This patch adds a structure to collect all the context data that is passed to the cleanup iterator. struct nf_ct_iter_data { struct net *net; void *data; u32 portid; int report; }; There is a netns field that allows to clean up conntrack entries specifically owned by the specified netns. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3ce9a5b42fe5..a32be8aa7ed2 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -236,10 +236,16 @@ static inline bool nf_ct_kill(struct nf_conn *ct) return nf_ct_delete(ct, 0, 0); } +struct nf_ct_iter_data { + struct net *net; + void *data; + u32 portid; + int report; +}; + /* Iterate over all conntracks: if iter returns true, it's deleted. */ -void nf_ct_iterate_cleanup_net(struct net *net, - int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report); +void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), + const struct nf_ct_iter_data *iter_data); /* also set unconfirmed conntracks as dying. Only use in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), -- cgit v1.2.3 From 2794cdb0b97bfe62d25c996c8afe4832207e78bc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Apr 2022 15:15:41 +0200 Subject: netfilter: nfnetlink: allow to detect if ctnetlink listeners exist At this time, every new conntrack gets the 'event cache extension' enabled for it. This is because the 'net.netfilter.nf_conntrack_events' sysctl defaults to 1. Changing the default to 0 means that commands that rely on the event notification extension, e.g. 'conntrack -E' or conntrackd, stop working. We COULD detect if there is a listener by means of 'nfnetlink_has_listeners()' and only add the extension if this is true. The downside is a dependency from conntrack module to nfnetlink module. This adds a different way: inc/dec a counter whenever a ctnetlink group is being (un)subscribed and toggle a flag in struct net. Next patches will take advantage of this and will only add the event extension if the flag is set. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index a71cfd4e2f21..0677cd3de034 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -95,6 +95,7 @@ struct nf_ip_net { struct netns_ct { #ifdef CONFIG_NF_CONNTRACK_EVENTS + bool ctnetlink_has_listener; bool ecache_dwork_pending; #endif u8 sysctl_log_invalid; /* Log invalid packets */ -- cgit v1.2.3 From b0a7ab4a776583b4344e8313638dc795f7589209 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Apr 2022 15:15:42 +0200 Subject: netfilter: conntrack: un-inline nf_ct_ecache_ext_add Only called when new ct is allocated or the extension isn't present. This function will be extended, place this in the conntrack module instead of inlining. The callers already depend on nf_conntrack module. Return value is changed to bool, noone used the returned pointer. Make sure that the core drops the newly allocated conntrack if the extension is requested but can't be added. This makes it necessary to ifdef the section, as the stub always returns false we'd drop every new conntrack if the the ecache extension is disabled in kconfig. Add from data path (xt_CT, nft_ct) is unchanged. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 30 +++++------------------------ 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index b57d73785e4d..2e3d58439e34 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -36,31 +36,6 @@ nf_ct_ecache_find(const struct nf_conn *ct) #endif } -static inline struct nf_conntrack_ecache * -nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) -{ -#ifdef CONFIG_NF_CONNTRACK_EVENTS - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *e; - - if (!ctmask && !expmask && net->ct.sysctl_events) { - ctmask = ~0; - expmask = ~0; - } - if (!ctmask && !expmask) - return NULL; - - e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); - if (e) { - e->ctmask = ctmask; - e->expmask = expmask; - } - return e; -#else - return NULL; -#endif -} - #ifdef CONFIG_NF_CONNTRACK_EVENTS /* This structure is passed to event handler */ @@ -89,6 +64,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct); int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, u32 portid, int report); +bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp); #else static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) @@ -103,6 +79,10 @@ static inline int nf_conntrack_eventmask_report(unsigned int eventmask, return 0; } +static inline bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) +{ + return false; +} #endif static inline void -- cgit v1.2.3 From 8edc813111001e9be3cce066d3d4091d2ef04a1d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Apr 2022 15:15:44 +0200 Subject: netfilter: prefer extension check to pointer check The pointer check usually results in a 'false positive': its likely that the ctnetlink module is loaded but no event monitoring is enabled. After recent change to autodetect ctnetlink usage and only allocate the ecache extension if a listener is active, check if the extension is present on a given conntrack. If its not there, there is nothing to report and calls to the notification framework can be elided. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 2 +- include/net/netfilter/nf_conntrack_ecache.h | 31 ++++++++++++++--------------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 13807ea94cd2..6406cfee34c2 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) if (ct) { if (!nf_ct_is_confirmed(ct)) ret = __nf_conntrack_confirm(skb); - if (likely(ret == NF_ACCEPT)) + if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct)) nf_ct_deliver_cached_events(ct); } return ret; diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 2e3d58439e34..0c1dac318e02 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -36,6 +36,15 @@ nf_ct_ecache_find(const struct nf_conn *ct) #endif } +static inline bool nf_ct_ecache_exist(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + return nf_ct_ext_exist(ct, NF_CT_EXT_ECACHE); +#else + return false; +#endif +} + #ifdef CONFIG_NF_CONNTRACK_EVENTS /* This structure is passed to event handler */ @@ -108,30 +117,20 @@ nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, u32 portid, int report) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - const struct net *net = nf_ct_net(ct); - - if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) - return 0; - - return nf_conntrack_eventmask_report(1 << event, ct, portid, report); -#else - return 0; + if (nf_ct_ecache_exist(ct)) + return nf_conntrack_eventmask_report(1 << event, ct, portid, report); #endif + return 0; } static inline int nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - const struct net *net = nf_ct_net(ct); - - if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) - return 0; - - return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); -#else - return 0; + if (nf_ct_ecache_exist(ct)) + return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); #endif + return 0; } #ifdef CONFIG_NF_CONNTRACK_EVENTS -- cgit v1.2.3 From 4f9bd53084d18c2f9f1ec68fa56587b99a2cef00 Mon Sep 17 00:00:00 2001 From: Kevin Mitchell Date: Fri, 29 Apr 2022 20:40:27 -0700 Subject: netfilter: conntrack: skip verification of zero UDP checksum The checksum is optional for UDP packets. However nf_reject would previously require a valid checksum to elicit a response such as ICMP_DEST_UNREACH. Add some logic to nf_reject_verify_csum to determine if a UDP packet has a zero checksum and should therefore not be verified. Signed-off-by: Kevin Mitchell Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_reject.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 9051c3a0c8e7..7c669792fb9c 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -5,12 +5,28 @@ #include #include -static inline bool nf_reject_verify_csum(__u8 proto) +static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff, + __u8 proto) { /* Skip protocols that don't use 16-bit one's complement checksum * of the entire payload. */ switch (proto) { + /* Protocols with optional checksums. */ + case IPPROTO_UDP: { + const struct udphdr *udp_hdr; + struct udphdr _udp_hdr; + + udp_hdr = skb_header_pointer(skb, dataoff, + sizeof(_udp_hdr), + &_udp_hdr); + if (!udp_hdr || udp_hdr->check) + return true; + + return false; + } + case IPPROTO_GRE: + /* Protocols with other integrity checks. */ case IPPROTO_AH: case IPPROTO_ESP: @@ -19,9 +35,6 @@ static inline bool nf_reject_verify_csum(__u8 proto) /* Protocols with partial checksums. */ case IPPROTO_UDPLITE: case IPPROTO_DCCP: - - /* Protocols with optional checksums. */ - case IPPROTO_GRE: return false; } return true; -- cgit v1.2.3 From 4273d3fa8aa594e37c295bef3e73073608644241 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 May 2022 13:37:59 +0200 Subject: mac80211: fix typo in documentation This is called offload_flags, remove the extra 'a'. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 75880fc70700..7b74c695a83a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1701,7 +1701,7 @@ enum ieee80211_offload_flags { * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed * at runtime, mac80211 will never touch this field - * @offloaad_flags: hardware offload capabilities/flags for this interface. + * @offload_flags: hardware offload capabilities/flags for this interface. * These are initialized by mac80211 before calling .add_interface, * .change_interface or .update_vif_offload and updated by the driver * within these ops, based on supported features or runtime change -- cgit v1.2.3 From f5bf586aadddd9803a1f16bc18621fd819377130 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 May 2022 16:38:19 +0200 Subject: mac80211: remove stray multi_sta_back_32bit docs This field doesn't exist, remove the docs for it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7b74c695a83a..944b31f0eb0d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -514,7 +514,6 @@ struct ieee80211_fils_discovery { * to that BSS) that can change during the lifetime of the BSS. * * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE - * @multi_sta_back_32bit: supports BA bitmap of 32-bits in Multi-STA BACK * @uora_exists: is the UORA element advertised by AP * @ack_enabled: indicates support to receive a multi-TID that solicits either * ACK, BACK or both -- cgit v1.2.3 From e6175a2ed1f18bf2f649625bf725e07adcfa6a28 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 13 May 2022 23:34:02 +0300 Subject: xfrm: fix "disable_policy" flag use when arriving from different devices In IPv4 setting the "disable_policy" flag on a device means no policy should be enforced for traffic originating from the device. This was implemented by seting the DST_NOPOLICY flag in the dst based on the originating device. However, dsts are cached in nexthops regardless of the originating devices, in which case, the DST_NOPOLICY flag value may be incorrect. Consider the following setup: +------------------------------+ | ROUTER | +-------------+ | +-----------------+ | | ipsec src |----|-|ipsec0 | | +-------------+ | |disable_policy=0 | +----+ | | +-----------------+ |eth1|-|----- +-------------+ | +-----------------+ +----+ | | noipsec src |----|-|eth0 | | +-------------+ | |disable_policy=1 | | | +-----------------+ | +------------------------------+ Where ROUTER has a default route towards eth1. dst entries for traffic arriving from eth0 would have DST_NOPOLICY and would be cached and therefore can be reused by traffic originating from ipsec0, skipping policy check. Fix by setting a IPSKB_NOPOLICY flag in IPCB and observing it instead of the DST in IN/FWD IPv4 policy checks. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Shmulik Ladkani Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/ip.h | 1 + include/net/xfrm.h | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 3984f2c39c4b..0161137914cf 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -56,6 +56,7 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) +#define IPSKB_NOPOLICY BIT(8) u16 frag_max_size; }; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6fb899ff5afc..d2efddce65d4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1093,6 +1093,18 @@ static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, return false; } +static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb, + int dir, unsigned short family) +{ + if (dir != XFRM_POLICY_OUT && family == AF_INET) { + /* same dst may be used for traffic originating from + * devices with different policy settings. + */ + return IPCB(skb)->flags & IPSKB_NOPOLICY; + } + return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY); +} + static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) @@ -1104,7 +1116,7 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, return __xfrm_policy_check(sk, ndir, skb, family); return __xfrm_check_nopolicy(net, skb, dir) || - (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); } -- cgit v1.2.3 From 3d48cb74816d8468f0235ce9a867a2d7b9832693 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Wed, 13 Apr 2022 21:58:14 +0530 Subject: nl80211: Parse NL80211_ATTR_HE_BSS_COLOR as a part of nl80211_parse_beacon NL80211_ATTR_HE_BSS_COLOR attribute can be included in both NL80211_CMD_START_AP and NL80211_CMD_SET_BEACON commands. Move he_bss_color from cfg80211_ap_settings to cfg80211_beacon_data and parse NL80211_ATTR_HE_BSS_COLOR as a part of nl80211_parse_beacon() to have bss color settings parsed for both start ap and set beacon commands. Add a new flag he_bss_color_valid to indicate whether NL80211_ATTR_HE_BSS_COLOR attribute is included. Signed-off-by: Rameshkumar Sundaram Link: https://lore.kernel.org/r/1649867295-7204-2-git-send-email-quic_ramess@quicinc.com [fix build ...] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6a3e3f0a8615..97a5804ccdcf 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1183,6 +1183,9 @@ struct cfg80211_mbssid_elems { * Token (measurement type 11) * @lci_len: LCI data length * @civicloc_len: Civic location data length + * @he_bss_color: BSS Color settings + * @he_bss_color_valid: indicates whether bss color + attribute is present in beacon data or not. */ struct cfg80211_beacon_data { const u8 *head, *tail; @@ -1202,6 +1205,8 @@ struct cfg80211_beacon_data { size_t probe_resp_len; size_t lci_len; size_t civicloc_len; + struct cfg80211_he_bss_color he_bss_color; + bool he_bss_color_valid; }; struct mac_address { @@ -1292,7 +1297,6 @@ struct cfg80211_unsol_bcast_probe_resp { * @sae_h2e_required: stations must support direct H2E technique in SAE * @flags: flags, as defined in enum cfg80211_ap_settings_flags * @he_obss_pd: OBSS Packet Detection settings - * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters @@ -1326,7 +1330,6 @@ struct cfg80211_ap_settings { bool twt_responder; u32 flags; struct ieee80211_he_obss_pd he_obss_pd; - struct cfg80211_he_bss_color he_bss_color; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; -- cgit v1.2.3 From 44fa75f207d8a106bc75e6230db61e961fdbf8a8 Mon Sep 17 00:00:00 2001 From: Jonas Jelonek Date: Mon, 9 May 2022 19:39:57 +0200 Subject: mac80211: extend current rate control tx status API This patch adds the new struct ieee80211_rate_status and replaces 'struct rate_info *rate' in ieee80211_tx_status with pointer and length annotation. The struct ieee80211_rate_status allows to: (1) receive tx power status feedback for transmit power control (TPC) per packet or packet retry (2) dynamic mapping of wifi chip specific multi-rate retry (mrr) chains with different lengths (3) increase the limit of annotatable rate indices to support IEEE802.11ac rate sets and beyond ieee80211_tx_info, control and status buffer, and ieee80211_tx_rate cannot be used to achieve these goals due to fixed size limitations. Our new struct contains a struct rate_info to annotate the rate that was used, retry count of the rate and tx power. It is intended for all information related to RC and TPC that needs to be passed from driver to mac80211 and its RC/TPC algorithms like Minstrel_HT. It corresponds to one stage in an mrr. Multiple subsequent instances of this struct can be included in struct ieee80211_tx_status via a pointer and a length variable. Those instances can be allocated on-stack. The former reference to a single instance of struct rate_info is replaced with our new annotation. An extension is introduced to struct ieee80211_hw. There are two new members called 'tx_power_levels' and 'max_txpwr_levels_idx' acting as a tx power level table. When a wifi device is registered, the driver shall supply all supported power levels in this list. This allows to support several quirks like differing power steps in power level ranges or alike. TPC can use this for algorithm and thus be designed more abstract instead of handling all possible step widths individually. Further mandatory changes in status.c, mt76 and ath11k drivers due to the removal of 'struct rate_info *rate' are also included. status.c already uses the information in ieee80211_tx_status->rate in radiotap, this is now changed to use ieee80211_rate_status->rate_idx. mt76 driver already uses struct rate_info to pass the tx rate to status path. The new members of the ieee80211_tx_status are set to NULL and 0 because the previously passed rate is not relevant to rate control and accurate information is passed via tx_info->status.rates. For ath11k, the txrate can be passed via this struct because ath11k uses firmware RC and thus the information does not interfere with software RC. Compile-Tested: current wireless-next tree with all flags on Tested-on: Xiaomi 4A Gigabit (MediaTek MT7603E, MT7612E) with OpenWrt Linux 5.10.113 Signed-off-by: Jonas Jelonek Link: https://lore.kernel.org/r/20220509173958.1398201-2-jelonek.jonas@gmail.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 944b31f0eb0d..ebadb2103968 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1143,20 +1143,41 @@ ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) return info->tx_time_est << 2; } +/*** + * struct ieee80211_rate_status - mrr stage for status path + * + * This struct is used in struct ieee80211_tx_status to provide drivers a + * dynamic way to report about used rates and power levels per packet. + * + * @rate_idx The actual used rate. + * @try_count How often the rate was tried. + * @tx_power_idx An idx into the ieee80211_hw->tx_power_levels list of the + * corresponding wifi hardware. The idx shall point to the power level + * that was used when sending the packet. + */ +struct ieee80211_rate_status { + struct rate_info rate_idx; + u8 try_count; + u8 tx_power_idx; +}; + /** * struct ieee80211_tx_status - extended tx status info for rate control * * @sta: Station that the packet was transmitted for * @info: Basic tx status information * @skb: Packet skb (can be NULL if not provided by the driver) - * @rate: The TX rate that was used when sending the packet + * @rates: Mrr stages that were used when sending the packet + * @n_rates: Number of mrr stages (count of instances for @rates) * @free_list: list where processed skbs are stored to be free'd by the driver */ struct ieee80211_tx_status { struct ieee80211_sta *sta; struct ieee80211_tx_info *info; struct sk_buff *skb; - struct rate_info *rate; + struct ieee80211_rate_status *rates; + u8 n_rates; + struct list_head *free_list; }; @@ -2657,6 +2678,12 @@ enum ieee80211_hw_flags { * refilling deficit of each TXQ. * * @max_mtu: the max mtu could be set. + * + * @tx_power_levels: a list of power levels supported by the wifi hardware. + * The power levels can be specified either as integer or fractions. + * The power level at idx 0 shall be the maximum positive power level. + * + * @max_txpwr_levels_idx: the maximum valid idx of 'tx_power_levels' list. */ struct ieee80211_hw { struct ieee80211_conf conf; @@ -2695,6 +2722,8 @@ struct ieee80211_hw { u8 tx_sk_pacing_shift; u8 weight_multiplier; u32 max_mtu; + const s8 *tx_power_levels; + u8 max_txpwr_levels_idx; }; static inline bool _ieee80211_hw_check(struct ieee80211_hw *hw, -- cgit v1.2.3 From 7c96d8ec96bb71aac54c9f872aaa65d7411ab864 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:00 -0700 Subject: ipv6: add struct hop_jumbo_hdr definition Following patches will need to add and remove local IPv6 jumbogram options to enable BIG TCP. Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ipv6.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 213612f1680c..63d019953c47 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -151,6 +151,17 @@ struct frag_hdr { __be32 identification; }; +/* + * Jumbo payload option, as described in RFC 2675 2. + */ +struct hop_jumbo_hdr { + u8 nexthdr; + u8 hdrlen; + u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ + u8 tlv_len; /* 4 */ + __be32 jumbo_payload_len; +}; + #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 -- cgit v1.2.3 From 09f3d1a3a52c696208008618a67e2c7c3fb16d41 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:01 -0700 Subject: ipv6/gso: remove temporary HBH/jumbo header ipv6 tcp and gro stacks will soon be able to build big TCP packets, with an added temporary Hop By Hop header. If GSO is involved for these large packets, we need to remove the temporary HBH header before segmentation happens. v2: perform HBH removal from ipv6_gso_segment() instead of skb_segment() (Alexander feedback) Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ipv6.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 63d019953c47..b6df0314aa02 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -467,6 +467,39 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); +/* This helper is specialized for BIG TCP needs. + * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. + * It assumes headers are already in skb->head. + * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. + */ +static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) +{ + const struct hop_jumbo_hdr *jhdr; + const struct ipv6hdr *nhdr; + + if (likely(skb->len <= GRO_MAX_SIZE)) + return 0; + + if (skb->protocol != htons(ETH_P_IPV6)) + return 0; + + if (skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) + return 0; + + nhdr = ipv6_hdr(skb); + + if (nhdr->nexthdr != NEXTHDR_HOP) + return 0; + + jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); + if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || + jhdr->nexthdr != IPPROTO_TCP) + return 0; + return jhdr->nexthdr; +} + static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special -- cgit v1.2.3 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b6df0314aa02..5b38bf1a586b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -477,7 +477,7 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; - if (likely(skb->len <= GRO_MAX_SIZE)) + if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) -- cgit v1.2.3 From 4c971d2f3548e4f11b1460ac048f5307e4b39fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:55:41 -0700 Subject: net: annotate races around sk->sk_bound_dev_if UDP sendmsg() is lockless, and reads sk->sk_bound_dev_if while this field can be changed by another thread. Adds minimal annotations to avoid KCSAN splats for UDP. Following patches will add more annotations to potential lockless readers. BUG: KCSAN: data-race in __ip6_datagram_connect / udpv6_sendmsg write to 0xffff888136d47a94 of 4 bytes by task 7681 on cpu 0: __ip6_datagram_connect+0x6e2/0x930 net/ipv6/datagram.c:221 ip6_datagram_connect+0x2a/0x40 net/ipv6/datagram.c:272 inet_dgram_connect+0x107/0x190 net/ipv4/af_inet.c:576 __sys_connect_file net/socket.c:1900 [inline] __sys_connect+0x197/0x1b0 net/socket.c:1917 __do_sys_connect net/socket.c:1927 [inline] __se_sys_connect net/socket.c:1924 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1924 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x50 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888136d47a94 of 4 bytes by task 7670 on cpu 1: udpv6_sendmsg+0xc60/0x16e0 net/ipv6/udp.c:1436 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:652 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x50 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffff9b Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 7670 Comm: syz-executor.3 Tainted: G W 5.18.0-rc1-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 I chose to not add Fixes: tag because race has minor consequences and stable teams busy enough. Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- include/net/sock.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 3984f2c39c4b..8ad04f60b413 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -93,7 +93,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm->sockc.mark = inet->sk.sk_mark; ipcm->sockc.tsflags = inet->sk.sk_tsflags; - ipcm->oif = inet->sk.sk_bound_dev_if; + ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; } diff --git a/include/net/sock.h b/include/net/sock.h index 01edfde4257d..72ca97ccb460 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2875,13 +2875,14 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val) */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; - if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif) + if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); - if (mdif && mdif == sk->sk_bound_dev_if) + if (mdif && mdif == bound_dev_if) return true; return false; -- cgit v1.2.3 From fdb5fd7f736ec7ae9fb36d2842ea6d9ebc4e7269 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:55:43 -0700 Subject: tcp: sk->sk_bound_dev_if once in inet_request_bound_dev_if() inet_request_bound_dev_if() reads sk->sk_bound_dev_if twice while listener socket is not locked. Another cpu could change this field under us. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 234d70ae5f4c..c1b5dcd6597c 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -116,14 +116,15 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) static inline int inet_request_bound_dev_if(const struct sock *sk, struct sk_buff *skb) { + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); - if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) + if (!bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif - return sk->sk_bound_dev_if; + return bound_dev_if; } static inline int inet_sk_bound_l3mdev(const struct sock *sk) -- cgit v1.2.3 From 5d368f03280d3678433a7f119efe15dfbbb87bc8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:55:49 -0700 Subject: ipv6: add READ_ONCE(sk->sk_bound_dev_if) in INET6_MATCH() INET6_MATCH() runs without holding a lock on the socket. We probably need to annotate most reads. This patch makes INET6_MATCH() an inline function to ease our changes. v2: inline function only defined if IS_ENABLED(CONFIG_IPV6) Change the name to inet6_match(), this is no longer a macro. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 81b965953036..f259e1ae14ba 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -103,15 +103,25 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, const int dif); int inet6_hash(struct sock *sk); -#endif /* IS_ENABLED(CONFIG_IPV6) */ -#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif, __sdif) \ - (((__sk)->sk_portpair == (__ports)) && \ - ((__sk)->sk_family == AF_INET6) && \ - ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ - ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ - (((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ - net_eq(sock_net(__sk), (__net))) +static inline bool inet6_match(struct net *net, const struct sock *sk, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + const __portpair ports, + const int dif, const int sdif) +{ + int bound_dev_if; + + if (!net_eq(sock_net(sk), net) || + sk->sk_family != AF_INET6 || + sk->sk_portpair != ports || + !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || + !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) + return false; + + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + return bound_dev_if == dif || bound_dev_if == sdif; +} +#endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _INET6_HASHTABLES_H */ -- cgit v1.2.3 From eda090c31fe923ab9463b884469744ec903ab0cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:55:50 -0700 Subject: inet: rename INET_MATCH() This is no longer a macro, but an inlined function. INET_MATCH() -> inet_match() Signed-off-by: Eric Dumazet Suggested-by: Olivier Hartkopp Suggested-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 59d72024ad1d..ebfa3df6f8dc 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -267,7 +267,7 @@ static inline struct sock *inet_lookup_listener(struct net *net, ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ -static inline bool INET_MATCH(struct net *net, const struct sock *sk, +static inline bool inet_match(struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { -- cgit v1.2.3 From d265929930e2ffafc744c0ae05fb70acd53be1ee Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 4 May 2022 08:35:59 -0700 Subject: netfilter: nf_conncount: reduce unnecessary GC Currently nf_conncount can trigger garbage collection (GC) at multiple places. Each GC process takes a spin_lock_bh to traverse the nf_conncount_list. We found that when testing port scanning use two parallel nmap, because the number of connection increase fast, the nf_conncount_count and its subsequent call to __nf_conncount_add take too much time, causing several CPU lockup. This happens when user set the conntrack limit to +20,000, because the larger the limit, the longer the list that GC has to traverse. The patch mitigate the performance issue by avoiding unnecessary GC with a timestamp. Whenever nf_conncount has done a GC, a timestamp is updated, and beforce the next time GC is triggered, we make sure it's more than a jiffies. By doin this we can greatly reduce the CPU cycles and avoid the softirq lockup. To reproduce it in OVS, $ ovs-appctl dpctl/ct-set-limits zone=1,limit=20000 $ ovs-appctl dpctl/ct-get-limits At another machine, runs two nmap $ nmap -p1- $ nmap -p1- Signed-off-by: William Tu Co-authored-by: Yifeng Sun Reported-by: Greg Rose Suggested-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 9645b47fa7e4..e227d997fc71 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -10,6 +10,7 @@ struct nf_conncount_data; struct nf_conncount_list { spinlock_t list_lock; + u32 last_gc; /* jiffies at most recent gc */ struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ }; -- cgit v1.2.3 From ee0e2f51e2115c2578d40e5a8ac33737984fe477 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 May 2022 12:43:46 +0200 Subject: cfg80211: fix kernel-doc for cfg80211_beacon_data The kernel-doc comment is formatted badly, resulting in a warning: include/net/cfg80211.h:1188: warning: bad line: [...] Fix that. Reported-by: Stephen Rothwell Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 97a5804ccdcf..cc8a9880b9d6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1185,7 +1185,7 @@ struct cfg80211_mbssid_elems { * @civicloc_len: Civic location data length * @he_bss_color: BSS Color settings * @he_bss_color_valid: indicates whether bss color - attribute is present in beacon data or not. + * attribute is present in beacon data or not. */ struct cfg80211_beacon_data { const u8 *head, *tail; -- cgit v1.2.3 From c1318b39c7d36bd5139a9c71044ff2b2d3c6f9d8 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 18 May 2022 12:27:31 +0300 Subject: tls: Add opt-in zerocopy mode of sendfile() TLS device offload copies sendfile data to a bounce buffer before transmitting. It allows to maintain the valid MAC on TLS records when the file contents change and a part of TLS record has to be retransmitted on TCP level. In many common use cases (like serving static files over HTTPS) the file contents are not changed on the fly. In many use cases breaking the connection is totally acceptable if the file is changed during transmission, because it would be received corrupted in any case. This commit allows to optimize performance for such use cases to providing a new optional mode of TLS sendfile(), in which the extra copy is skipped. Removing this copy improves performance significantly, as TLS and TCP sendfile perform the same operations, and the only overhead is TLS header/trailer insertion. The new mode can only be enabled with the new socket option named TLS_TX_ZEROCOPY_SENDFILE on per-socket basis. It preserves backwards compatibility with existing applications that rely on the copying behavior. The new mode is safe, meaning that unsolicited modifications of the file being sent can't break integrity of the kernel. The worst thing that can happen is sending a corrupted TLS record, which is in any case not forbidden when using regular TCP sockets. Sockets other than TLS device offload are not affected by the new socket option. The actual status of zerocopy sendfile can be queried with sock_diag. Performance numbers in a single-core test with 24 HTTPS streams on nginx, under 100% CPU load: * non-zerocopy: 33.6 Gbit/s * zerocopy: 79.92 Gbit/s CPU: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz Signed-off-by: Boris Pismenny Signed-off-by: Tariq Toukan Signed-off-by: Maxim Mikityanskiy Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220518092731.1243494-1-maximmi@nvidia.com Signed-off-by: Paolo Abeni --- include/net/tls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index b59f0a63292b..8017f1703447 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -238,6 +238,7 @@ struct tls_context { u8 tx_conf:3; u8 rx_conf:3; + u8 zerocopy_sendfile:1; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); -- cgit v1.2.3 From 8f9ae5b3ae80f168a6224529e3787f4fb27f299a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 4 May 2022 17:06:28 -0700 Subject: Bluetooth: eir: Add helpers for managing service data This adds helpers for accessing and appending service data (0x16) ad type. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 62a9bb022aed..fe7935be7dc4 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -625,6 +625,7 @@ enum { #define EIR_SSP_RAND_R192 0x0F /* Simple Pairing Randomizer R-192 */ #define EIR_DEVICE_ID 0x10 /* device ID */ #define EIR_APPEARANCE 0x19 /* Device appearance */ +#define EIR_SERVICE_DATA 0x16 /* Service Data */ #define EIR_LE_BDADDR 0x1B /* LE Bluetooth device address */ #define EIR_LE_ROLE 0x1C /* LE role */ #define EIR_SSP_HASH_C256 0x1D /* Simple Pairing Hash C-256 */ -- cgit v1.2.3 From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 May 2022 16:30:10 -0700 Subject: bpf: Add bpf_skc_to_mptcp_sock_proto This patch implements a new struct bpf_func_proto, named bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP, and a new helper bpf_skc_to_mptcp_sock(), which invokes another new helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct mptcp_sock from a given subflow socket. v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c, remove build check for CONFIG_BPF_JIT v5: Drop EXPORT_SYMBOL (Martin) Co-developed-by: Nicolas Rybowski Co-developed-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com --- include/net/mptcp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 8b1afd6f5cc4..2ba09de955c7 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -284,4 +284,10 @@ static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif +#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL) +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk); +#else +static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; } +#endif + #endif /* __NET_MPTCP_H */ -- cgit v1.2.3 From d5a42de8bdbe25081f07b801d8b35f4d75a791f4 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 19 May 2022 17:18:33 -0700 Subject: net: Add a second bind table hashed by port and address We currently have one tcp bind table (bhash) which hashes by port number only. In the socket bind path, we check for bind conflicts by traversing the specified port's inet_bind2_bucket while holding the bucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()). In instances where there are tons of sockets hashed to the same port at different addresses, checking for a bind conflict is time-intensive and can cause softirq cpu lockups, as well as stops new tcp connections since __inet_inherit_port() also contests for the spinlock. This patch proposes adding a second bind table, bhash2, that hashes by port and ip address. Searching the bhash2 table leads to significantly faster conflict resolution and less time holding the spinlock. Signed-off-by: Joanne Koong Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 ++ include/net/inet_hashtables.h | 68 +++++++++++++++++++++++++++++++++++++- include/net/sock.h | 14 ++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 85cd695e7fd1..077cd730ce2f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -25,6 +25,7 @@ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; +struct inet_bind2_bucket; struct tcp_congestion_ops; /* @@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops { * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node + * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout @@ -83,6 +85,7 @@ struct inet_connection_sock { struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; + struct inet_bind2_bucket *icsk_bind2_hash; unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ebfa3df6f8dc..a0887b70967b 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -90,11 +90,32 @@ struct inet_bind_bucket { struct hlist_head owners; }; +struct inet_bind2_bucket { + possible_net_t ib_net; + int l3mdev; + unsigned short port; + union { +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr v6_rcv_saddr; +#endif + __be32 rcv_saddr; + }; + /* Node in the inet2_bind_hashbucket chain */ + struct hlist_node node; + /* List of sockets hashed to this bucket */ + struct hlist_head owners; +}; + static inline struct net *ib_net(struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } +static inline struct net *ib2_net(struct inet_bind2_bucket *ib) +{ + return read_pnet(&ib->ib_net); +} + #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) @@ -103,6 +124,15 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; +/* This is synchronized using the inet_bind_hashbucket's spinlock. + * Instead of having separate spinlocks, the inet_bind2_hashbucket can share + * the inet_bind_hashbucket's given that in every case where the bhash2 table + * is useful, a lookup in the bhash table also occurs. + */ +struct inet_bind2_hashbucket { + struct hlist_head chain; +}; + /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without @@ -134,6 +164,12 @@ struct inet_hashinfo { */ struct kmem_cache *bind_bucket_cachep; struct inet_bind_hashbucket *bhash; + /* The 2nd binding table hashed by port and address. + * This is used primarily for expediting the resolution of bind + * conflicts. + */ + struct kmem_cache *bind2_bucket_cachep; + struct inet_bind2_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ @@ -193,6 +229,36 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); +static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb, + struct net *net, + const unsigned short port, + int l3mdev) +{ + return net_eq(ib_net(tb), net) && tb->port == port && + tb->l3mdev == l3mdev; +} + +struct inet_bind2_bucket * +inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, + struct inet_bind2_hashbucket *head, + const unsigned short port, int l3mdev, + const struct sock *sk); + +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, + struct inet_bind2_bucket *tb); + +struct inet_bind2_bucket * +inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, + const unsigned short port, int l3mdev, + struct sock *sk, + struct inet_bind2_hashbucket **head); + +bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, + struct net *net, + const unsigned short port, + int l3mdev, + const struct sock *sk); + static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { @@ -200,7 +266,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - const unsigned short snum); + struct inet_bind2_bucket *tb2, const unsigned short snum); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); diff --git a/include/net/sock.h b/include/net/sock.h index 72ca97ccb460..c585ef6565d9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -348,6 +348,7 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference + * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -537,6 +538,7 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; + struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -817,6 +819,16 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } +static inline void __sk_del_bind2_node(struct sock *sk) +{ + __hlist_del(&sk->sk_bind2_node); +} + +static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) +{ + hlist_add_head(&sk->sk_bind2_node, list); +} + #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -834,6 +846,8 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) +#define sk_for_each_bound_bhash2(__sk, list) \ + hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset -- cgit v1.2.3 From c304eddcecfe2513ff98ce3ae97d1c196d82ba08 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 May 2022 13:20:54 -0700 Subject: net: wrap the wireless pointers in struct net_device in an ifdef Most protocol-specific pointers in struct net_device are under a respective ifdef. Wireless is the notable exception. Since there's a sizable number of custom-built kernels for datacenter workloads which don't build wireless it seems reasonable to ifdefy those pointers as well. While at it move IPv4 and IPv6 pointers up, those are special for obvious reasons. Acked-by: Johannes Berg Acked-by: Stefan Schmidt # ieee802154 Acked-by: Sven Eckelmann Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/cfg80211.h | 2 ++ include/net/cfg802154.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cc8a9880b9d6..6d02e12e4702 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -8006,7 +8006,9 @@ int cfg80211_register_netdevice(struct net_device *dev); */ static inline void cfg80211_unregister_netdevice(struct net_device *dev) { +#if IS_ENABLED(CONFIG_CFG80211) cfg80211_unregister_wdev(dev->ieee80211_ptr); +#endif } /** diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 85f9e8417688..d8d8719315fd 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -373,6 +373,7 @@ struct wpan_dev { #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) static inline int wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, @@ -383,6 +384,7 @@ wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len); } +#endif struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size); -- cgit v1.2.3 From 4934609dda03ec90ca5052deecbe455b09a44e21 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 23 May 2022 16:17:06 +0000 Subject: amt: fix typo in amt AMT_MSG_TEARDOWM is defined, But it should be AMT_MSG_TEARDOWN. Fixes: b9022b53adad ("amt: add control plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Jakub Kicinski --- include/net/amt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/amt.h b/include/net/amt.h index 7a4db8b903ee..0e40c3d64fcf 100644 --- a/include/net/amt.h +++ b/include/net/amt.h @@ -15,7 +15,7 @@ enum amt_msg_type { AMT_MSG_MEMBERSHIP_QUERY, AMT_MSG_MEMBERSHIP_UPDATE, AMT_MSG_MULTICAST_DATA, - AMT_MSG_TEARDOWM, + AMT_MSG_TEARDOWN, __AMT_MSG_MAX, }; -- cgit v1.2.3 From a54ce3703613e41fe1d98060b62ec09a3984dc28 Mon Sep 17 00:00:00 2001 From: Vincent Ray Date: Wed, 25 May 2022 17:17:46 -0700 Subject: net: sched: fixed barrier to prevent skbuff sticking in qdisc backlog In qdisc_run_begin(), smp_mb__before_atomic() used before test_bit() does not provide any ordering guarantee as test_bit() is not an atomic operation. This, added to the fact that the spin_trylock() call at the beginning of qdisc_run_begin() does not guarantee acquire semantics if it does not grab the lock, makes it possible for the following statement : if (test_bit(__QDISC_STATE_MISSED, &qdisc->state)) to be executed before an enqueue operation called before qdisc_run_begin(). As a result the following race can happen : CPU 1 CPU 2 qdisc_run_begin() qdisc_run_begin() /* true */ set(MISSED) . /* returns false */ . . /* sees MISSED = 1 */ . /* so qdisc not empty */ . __qdisc_run() . . . pfifo_fast_dequeue() ----> /* may be done here */ . | . clear(MISSED) | . . | . smp_mb __after_atomic(); | . . | . /* recheck the queue */ | . /* nothing => exit */ | enqueue(skb1) | . | qdisc_run_begin() | . | spin_trylock() /* fail */ | . | smp_mb__before_atomic() /* not enough */ | . ---- if (test_bit(MISSED)) return false; /* exit */ In the above scenario, CPU 1 and CPU 2 both try to grab the qdisc->seqlock at the same time. Only CPU 2 succeeds and enters the bypass code path, where it emits its skb then calls __qdisc_run(). CPU1 fails, sets MISSED and goes down the traditionnal enqueue() + dequeue() code path. But when executing qdisc_run_begin() for the second time, after enqueuing its skbuff, it sees the MISSED bit still set (by itself) and consequently chooses to exit early without setting it again nor trying to grab the spinlock again. Meanwhile CPU2 has seen MISSED = 1, cleared it, checked the queue and found it empty, so it returned. At the end of the sequence, we end up with skb1 enqueued in the backlog, both CPUs out of __dev_xmit_skb(), the MISSED bit not set, and no __netif_schedule() called made. skb1 will now linger in the qdisc until somebody later performs a full __qdisc_run(). Associated to the bypass capacity of the qdisc, and the ability of the TCP layer to avoid resending packets which it knows are still in the qdisc, this can lead to serious traffic "holes" in a TCP connection. We fix this by replacing the smp_mb__before_atomic() / test_bit() / set_bit() / smp_mb__after_atomic() sequence inside qdisc_run_begin() by a single test_and_set_bit() call, which is more concise and enforces the needed memory barriers. Fixes: 89837eb4b246 ("net: sched: add barrier to ensure correct ordering for lockless qdisc") Signed-off-by: Vincent Ray Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20220526001746.2437669-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9bab396c1f3b..80973ce820f3 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -187,37 +187,17 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) if (spin_trylock(&qdisc->seqlock)) return true; - /* Paired with smp_mb__after_atomic() to make sure - * STATE_MISSED checking is synchronized with clearing - * in pfifo_fast_dequeue(). + /* No need to insist if the MISSED flag was already set. + * Note that test_and_set_bit() also gives us memory ordering + * guarantees wrt potential earlier enqueue() and below + * spin_trylock(), both of which are necessary to prevent races */ - smp_mb__before_atomic(); - - /* If the MISSED flag is set, it means other thread has - * set the MISSED flag before second spin_trylock(), so - * we can return false here to avoid multi cpus doing - * the set_bit() and second spin_trylock() concurrently. - */ - if (test_bit(__QDISC_STATE_MISSED, &qdisc->state)) + if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state)) return false; - /* Set the MISSED flag before the second spin_trylock(), - * if the second spin_trylock() return false, it means - * other cpu holding the lock will do dequeuing for us - * or it will see the MISSED flag set after releasing - * lock and reschedule the net_tx_action() to do the - * dequeuing. - */ - set_bit(__QDISC_STATE_MISSED, &qdisc->state); - - /* spin_trylock() only has load-acquire semantic, so use - * smp_mb__after_atomic() to ensure STATE_MISSED is set - * before doing the second spin_trylock(). - */ - smp_mb__after_atomic(); - - /* Retry again in case other CPU may not see the new flag - * after it releases the lock at the end of qdisc_run_end(). + /* Try to take the lock again to make sure that we will either + * grab it or the CPU that still has it will see MISSED set + * when testing it in qdisc_run_end() */ return spin_trylock(&qdisc->seqlock); } -- cgit v1.2.3 From 56b14ecec97f39118bf85c9ac2438c5a949509ed Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 20 May 2022 00:02:04 +0200 Subject: netfilter: conntrack: re-fetch conntrack after insertion In case the conntrack is clashing, insertion can free skb->_nfct and set skb->_nfct to the already-confirmed entry. This wasn't found before because the conntrack entry and the extension space used to free'd after an rcu grace period, plus the race needs events enabled to trigger. Reported-by: Fixes: 71d8c47fc653 ("netfilter: conntrack: introduce clash resolution on insertion race") Fixes: 2ad9d7747c10 ("netfilter: conntrack: free extension area immediately") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 6406cfee34c2..37866c8386e2 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -58,8 +58,13 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) int ret = NF_ACCEPT; if (ct) { - if (!nf_ct_is_confirmed(ct)) + if (!nf_ct_is_confirmed(ct)) { ret = __nf_conntrack_confirm(skb); + + if (ret == NF_ACCEPT) + ct = (struct nf_conn *)skb_nfct(skb); + } + if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct)) nf_ct_deliver_cached_events(ct); } -- cgit v1.2.3 From 2e8728c955ce0624b958eee6e030a37aca3a5d86 Mon Sep 17 00:00:00 2001 From: Guoju Fang Date: Sat, 28 May 2022 18:16:28 +0800 Subject: net: sched: add barrier to fix packet stuck problem for lockless qdisc In qdisc_run_end(), the spin_unlock() only has store-release semantic, which guarantees all earlier memory access are visible before it. But the subsequent test_bit() has no barrier semantics so may be reordered ahead of the spin_unlock(). The store-load reordering may cause a packet stuck problem. The concurrent operations can be described as below, CPU 0 | CPU 1 qdisc_run_end() | qdisc_run_begin() . | . ----> /* may be reorderd here */ | . | . | . | spin_unlock() | set_bit() | . | smp_mb__after_atomic() ---- test_bit() | spin_trylock() . | . Consider the following sequence of events: CPU 0 reorder test_bit() ahead and see MISSED = 0 CPU 1 calls set_bit() CPU 1 calls spin_trylock() and return fail CPU 0 executes spin_unlock() At the end of the sequence, CPU 0 calls spin_unlock() and does nothing because it see MISSED = 0. The skb on CPU 1 has beed enqueued but no one take it, until the next cpu pushing to the qdisc (if ever ...) will notice and dequeue it. This patch fix this by adding one explicit barrier. As spin_unlock() and test_bit() ordering is a store-load ordering, a full memory barrier smp_mb() is needed here. Fixes: a90c57f2cedd ("net: sched: fix packet stuck problem for lockless qdisc") Signed-off-by: Guoju Fang Link: https://lore.kernel.org/r/20220528101628.120193-1-gjfang@linux.alibaba.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 80973ce820f3..d6cf5116b5f9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -209,6 +209,12 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); + /* spin_unlock() only has store-release semantic. The unlock + * and test_bit() ordering is a store-load ordering, so a full + * memory barrier is needed here. + */ + smp_mb(); + if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); -- cgit v1.2.3 From c4caa500ffebf64795d1c0f6f9d6f179b502c6b7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 31 May 2022 14:37:27 +0800 Subject: bonding: guard ns_targets by CONFIG_IPV6 Guard ns_targets in struct bond_params by CONFIG_IPV6, which could save 256 bytes if IPv6 not configed. Also add this protection for function bond_is_ip6_target_ok() and bond_get_targets_ip6(). Remove the IS_ENABLED() check for bond_opts[] as this will make BOND_OPT_NS_TARGETS uninitialized if CONFIG_IPV6 not enabled. Add a dummy bond_option_ns_ip6_targets_set() for this situation. Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Acked-by: Jonathan Toppins Link: https://lore.kernel.org/r/20220531063727.224043-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- include/net/bonding.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index b14f4c0b4e9e..cb904d356e31 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -149,7 +149,9 @@ struct bond_params { struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; +#if IS_ENABLED(CONFIG_IPV6) struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; +#endif /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; @@ -503,12 +505,14 @@ static inline int bond_is_ip_target_ok(__be32 addr) return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } +#if IS_ENABLED(CONFIG_IPV6) static inline int bond_is_ip6_target_ok(struct in6_addr *addr) { return !ipv6_addr_any(addr) && !ipv6_addr_loopback(addr) && !ipv6_addr_is_multicast(addr); } +#endif /* Get the oldest arp which we've received on this slave for bond's * arp_targets. @@ -746,6 +750,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) return -1; } +#if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { int i; @@ -758,6 +763,7 @@ static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr return -1; } +#endif /* exported from bond_main.c */ extern unsigned int bond_net_id; -- cgit v1.2.3 From b6d9014a3335194590abdd2a2471ef5147a67645 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 30 May 2022 18:40:06 +0200 Subject: netfilter: nf_tables: delete flowtable hooks via transaction list Remove inactive bool field in nft_hook object that was introduced in abadb2f865d7 ("netfilter: nf_tables: delete devices from flowtable"). Move stale flowtable hooks to transaction list instead. Deleting twice the same device does not result in ENOENT. Fixes: abadb2f865d7 ("netfilter: nf_tables: delete devices from flowtable") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 20af9d3557b9..279ae0fff7ad 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1090,7 +1090,6 @@ struct nft_stats { struct nft_hook { struct list_head list; - bool inactive; struct nf_hook_ops ops; struct rcu_head rcu; }; -- cgit v1.2.3 From 7d8a3a477b3e25ada8dc71d22048c2ea417209a0 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Mon, 30 May 2022 23:21:58 +0800 Subject: ax25: Fix ax25 session cleanup problems There are session cleanup problems in ax25_release() and ax25_disconnect(). If we setup a session and then disconnect, the disconnected session is still in "LISTENING" state that is shown below. Active AX.25 sockets Dest Source Device State Vr/Vs Send-Q Recv-Q DL9SAU-4 DL9SAU-3 ??? LISTENING 000/000 0 0 DL9SAU-3 DL9SAU-4 ??? LISTENING 000/000 0 0 The first reason is caused by del_timer_sync() in ax25_release(). The timers of ax25 are used for correct session cleanup. If we use ax25_release() to close ax25 sessions and ax25_dev is not null, the del_timer_sync() functions in ax25_release() will execute. As a result, the sessions could not be cleaned up correctly, because the timers have stopped. In order to solve this problem, this patch adds a device_up flag in ax25_dev in order to judge whether the device is up. If there are sessions to be cleaned up, the del_timer_sync() in ax25_release() will not execute. What's more, we add ax25_cb_del() in ax25_kill_by_device(), because the timers have been stopped and there are no functions that could delete ax25_cb if we do not call ax25_release(). Finally, we reorder the position of ax25_list_lock in ax25_cb_del() in order to synchronize among different functions that call ax25_cb_del(). The second reason is caused by improper check in ax25_disconnect(). The incoming ax25 sessions which ax25->sk is null will close heartbeat timer, because the check "if(!ax25->sk || ..)" is satisfied. As a result, the session could not be cleaned up properly. In order to solve this problem, this patch changes the improper check to "if(ax25->sk && ..)" in ax25_disconnect(). What`s more, the ax25_disconnect() may be called twice, which is not necessary. For example, ax25_kill_by_device() calls ax25_disconnect() and sets ax25->state to AX25_STATE_0, but ax25_release() calls ax25_disconnect() again. In order to solve this problem, this patch add a check in ax25_release(). If the flag of ax25->sk equals to SOCK_DEAD, the ax25_disconnect() in ax25_release() should not be executed. Fixes: 82e31755e55f ("ax25: Fix UAF bugs in ax25 timers") Fixes: 8a367e74c012 ("ax25: Fix segfault after sock connection timeout") Reported-and-tested-by: Thomas Osterried Signed-off-by: Duoming Zhou Link: https://lore.kernel.org/r/20220530152158.108619-1-duoming@zju.edu.cn Signed-off-by: Paolo Abeni --- include/net/ax25.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ax25.h b/include/net/ax25.h index 0f9790c455bb..a427a05672e2 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -228,6 +228,7 @@ typedef struct ax25_dev { ax25_dama_info dama; #endif refcount_t refcount; + bool device_up; } ax25_dev; typedef struct ax25_cb { -- cgit v1.2.3 From e1cff7002b716bd0b5f5f4afd4273c99aa8644be Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 5 Jun 2022 11:51:48 -0700 Subject: bluetooth: don't use bitmaps for random flag accesses The bluetooth code uses our bitmap infrastructure for the two bits (!) of connection setup flags, and in the process causes odd problems when it converts between a bitmap and just the regular values of said bits. It's completely pointless to do things like bitmap_to_arr32() to convert a bitmap into a u32. It shoudln't have been a bitmap in the first place. The reason to use bitmaps is if you have arbitrary number of bits you want to manage (not two!), or if you rely on the atomicity guarantees of the bitmap setting and clearing. The code could use an "atomic_t" and use "atomic_or/andnot()" to set and clear the bit values, but considering that it then copies the bitmaps around with "bitmap_to_arr32()" and friends, there clearly cannot be a lot of atomicity requirements. So just use a regular integer. In the process, this avoids the warnings about erroneous use of bitmap_from_u64() which were triggered on 32-bit architectures when conversion from a u64 would access two words (and, surprise, surprise, only one word is needed - and indeed overkill - for a 2-bit bitmap). That was always problematic, but the compiler seems to notice it and warn about the invalid pattern only after commit 0a97953fd221 ("lib: add bitmap_{from,to}_arr64") changed the exact implementation details of 'bitmap_from_u64()', as reported by Sudip Mukherjee and Stephen Rothwell. Fixes: fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags") Link: https://lore.kernel.org/all/YpyJ9qTNHJzz0FHY@debian/ Link: https://lore.kernel.org/all/20220606080631.0c3014f2@canb.auug.org.au/ Link: https://lore.kernel.org/all/20220605162537.1604762-1-yury.norov@gmail.com/ Reported-by: Stephen Rothwell Reported-by: Sudip Mukherjee Reviewed-by: Yury Norov Cc: Luiz Augusto von Dentz Cc: Marcel Holtmann Signed-off-by: Linus Torvalds --- include/net/bluetooth/hci_core.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5a52a2018b56..c0ea2a4892b1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -155,21 +155,18 @@ struct bdaddr_list_with_irk { u8 local_irk[16]; }; +/* Bitmask of connection flags */ enum hci_conn_flags { - HCI_CONN_FLAG_REMOTE_WAKEUP, - HCI_CONN_FLAG_DEVICE_PRIVACY, - - __HCI_CONN_NUM_FLAGS, + HCI_CONN_FLAG_REMOTE_WAKEUP = 1, + HCI_CONN_FLAG_DEVICE_PRIVACY = 2, }; - -/* Make sure number of flags doesn't exceed sizeof(current_flags) */ -static_assert(__HCI_CONN_NUM_FLAGS < 32); +typedef u8 hci_conn_flags_t; struct bdaddr_list_with_flags { struct list_head list; bdaddr_t bdaddr; u8 bdaddr_type; - DECLARE_BITMAP(flags, __HCI_CONN_NUM_FLAGS); + hci_conn_flags_t flags; }; struct bt_uuid { @@ -576,7 +573,7 @@ struct hci_dev { struct rfkill *rfkill; DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS); - DECLARE_BITMAP(conn_flags, __HCI_CONN_NUM_FLAGS); + hci_conn_flags_t conn_flags; __s8 adv_tx_power; __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; @@ -775,7 +772,7 @@ struct hci_conn_params { struct hci_conn *conn; bool explicit_connect; - DECLARE_BITMAP(flags, __HCI_CONN_NUM_FLAGS); + hci_conn_flags_t flags; u8 privacy_mode; }; -- cgit v1.2.3 From 3a41c64d9c1185a2f3a184015e2a9b78bfc99c71 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jun 2022 17:31:29 +0200 Subject: netfilter: nf_tables: bail out early if hardware offload is not supported If user requests for NFT_CHAIN_HW_OFFLOAD, then check if either device provides the .ndo_setup_tc interface or there is an indirect flow block that has been registered. Otherwise, bail out early from the preparation phase. Moreover, validate that family == NFPROTO_NETDEV and hook is NF_NETDEV_INGRESS. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- include/net/flow_offload.h | 1 + include/net/netfilter/nf_tables_offload.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 021778a7e1af..6484095a8c01 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -612,5 +612,6 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); +bool flow_indr_dev_exists(void); #endif /* _NET_FLOW_OFFLOAD_H */ diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 797147843958..3568b6a2f5f0 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -92,7 +92,7 @@ int nft_flow_rule_offload_commit(struct net *net); NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ memset(&(__reg)->mask, 0xff, (__reg)->len); -int nft_chain_offload_priority(struct nft_base_chain *basechain); +bool nft_chain_offload_support(const struct nft_base_chain *basechain); int nft_offload_init(void); void nft_offload_exit(void); -- cgit v1.2.3 From f93431c86b631bbca5614c66f966bf3ddb3c2803 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 7 Jun 2022 20:00:27 +0800 Subject: ipv6: Fix signed integer overflow in __ip6_append_data Resurrect ubsan overflow checks and ubsan report this warning, fix it by change the variable [length] type to size_t. UBSAN: signed-integer-overflow in net/ipv6/ip6_output.c:1489:19 2147479552 + 8567 cannot be represented in type 'int' CPU: 0 PID: 253 Comm: err Not tainted 5.16.0+ #1 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x214/0x230 show_stack+0x30/0x78 dump_stack_lvl+0xf8/0x118 dump_stack+0x18/0x30 ubsan_epilogue+0x18/0x60 handle_overflow+0xd0/0xf0 __ubsan_handle_add_overflow+0x34/0x44 __ip6_append_data.isra.48+0x1598/0x1688 ip6_append_data+0x128/0x260 udpv6_sendmsg+0x680/0xdd0 inet6_sendmsg+0x54/0x90 sock_sendmsg+0x70/0x88 ____sys_sendmsg+0xe8/0x368 ___sys_sendmsg+0x98/0xe0 __sys_sendmmsg+0xf4/0x3b8 __arm64_sys_sendmmsg+0x34/0x48 invoke_syscall+0x64/0x160 el0_svc_common.constprop.4+0x124/0x300 do_el0_svc+0x44/0xc8 el0_svc+0x3c/0x1e8 el0t_64_sync_handler+0x88/0xb0 el0t_64_sync+0x16c/0x170 Changes since v1: -Change the variable [length] type to unsigned, as Eric Dumazet suggested. Changes since v2: -Don't change exthdrlen type in ip6_make_skb, as Paolo Abeni suggested. Changes since v3: -Don't change ulen type in udpv6_sendmsg and l2tp_ip6_sendmsg, as Jakub Kicinski suggested. Reported-by: Hulk Robot Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/20220607120028.845916-1-wangyufen@huawei.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5b38bf1a586b..de9dcc5652c4 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1063,7 +1063,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); @@ -1079,7 +1079,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); -- cgit v1.2.3 From 593d1ebe00a45af5cb7bda1235c0790987c2a2b2 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 15 Jun 2022 12:32:13 -0700 Subject: Revert "net: Add a second bind table hashed by port and address" This reverts: commit d5a42de8bdbe ("net: Add a second bind table hashed by port and address") commit 538aaf9b2383 ("selftests: Add test for timing a bind request to a port with a populated bhash entry") Link: https://lore.kernel.org/netdev/20220520001834.2247810-1-kuba@kernel.org/ There are a few things that need to be fixed here: * Updating bhash2 in cases where the socket's rcv saddr changes * Adding bhash2 hashbucket locks Links to syzbot reports: https://lore.kernel.org/netdev/00000000000022208805e0df247a@google.com/ https://lore.kernel.org/netdev/0000000000003f33bc05dfaf44fe@google.com/ Fixes: d5a42de8bdbe ("net: Add a second bind table hashed by port and address") Reported-by: syzbot+015d756bbd1f8b5c8f09@syzkaller.appspotmail.com Reported-by: syzbot+98fd2d1422063b0f8c44@syzkaller.appspotmail.com Reported-by: syzbot+0a847a982613c6438fba@syzkaller.appspotmail.com Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20220615193213.2419568-1-joannelkoong@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 -- include/net/inet_hashtables.h | 68 +------------------------------------- include/net/sock.h | 14 -------- 3 files changed, 1 insertion(+), 84 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 077cd730ce2f..85cd695e7fd1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -25,7 +25,6 @@ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; -struct inet_bind2_bucket; struct tcp_congestion_ops; /* @@ -58,7 +57,6 @@ struct inet_connection_sock_af_ops { * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node - * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout @@ -85,7 +83,6 @@ struct inet_connection_sock { struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; - struct inet_bind2_bucket *icsk_bind2_hash; unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a0887b70967b..ebfa3df6f8dc 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -90,32 +90,11 @@ struct inet_bind_bucket { struct hlist_head owners; }; -struct inet_bind2_bucket { - possible_net_t ib_net; - int l3mdev; - unsigned short port; - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; -#endif - __be32 rcv_saddr; - }; - /* Node in the inet2_bind_hashbucket chain */ - struct hlist_node node; - /* List of sockets hashed to this bucket */ - struct hlist_head owners; -}; - static inline struct net *ib_net(struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } -static inline struct net *ib2_net(struct inet_bind2_bucket *ib) -{ - return read_pnet(&ib->ib_net); -} - #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) @@ -124,15 +103,6 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; -/* This is synchronized using the inet_bind_hashbucket's spinlock. - * Instead of having separate spinlocks, the inet_bind2_hashbucket can share - * the inet_bind_hashbucket's given that in every case where the bhash2 table - * is useful, a lookup in the bhash table also occurs. - */ -struct inet_bind2_hashbucket { - struct hlist_head chain; -}; - /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without @@ -164,12 +134,6 @@ struct inet_hashinfo { */ struct kmem_cache *bind_bucket_cachep; struct inet_bind_hashbucket *bhash; - /* The 2nd binding table hashed by port and address. - * This is used primarily for expediting the resolution of bind - * conflicts. - */ - struct kmem_cache *bind2_bucket_cachep; - struct inet_bind2_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ @@ -229,36 +193,6 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); -static inline bool check_bind_bucket_match(struct inet_bind_bucket *tb, - struct net *net, - const unsigned short port, - int l3mdev) -{ - return net_eq(ib_net(tb), net) && tb->port == port && - tb->l3mdev == l3mdev; -} - -struct inet_bind2_bucket * -inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, - struct inet_bind2_hashbucket *head, - const unsigned short port, int l3mdev, - const struct sock *sk); - -void inet_bind2_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind2_bucket *tb); - -struct inet_bind2_bucket * -inet_bind2_bucket_find(struct inet_hashinfo *hinfo, struct net *net, - const unsigned short port, int l3mdev, - struct sock *sk, - struct inet_bind2_hashbucket **head); - -bool check_bind2_bucket_match_nulladdr(struct inet_bind2_bucket *tb, - struct net *net, - const unsigned short port, - int l3mdev, - const struct sock *sk); - static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { @@ -266,7 +200,7 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - struct inet_bind2_bucket *tb2, const unsigned short snum); + const unsigned short snum); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); diff --git a/include/net/sock.h b/include/net/sock.h index c585ef6565d9..72ca97ccb460 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -348,7 +348,6 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -538,7 +537,6 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -819,16 +817,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -846,8 +834,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset -- cgit v1.2.3 From e34a07c0ae3906f97eb18df50902e2a01c1015b6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Jun 2022 12:13:53 -0700 Subject: sock: redo the psock vs ULP protection check Commit 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") has moved the inet_csk_has_ulp(sk) check from sk_psock_init() to the new tcp_bpf_update_proto() function. I'm guessing that this was done to allow creating psocks for non-inet sockets. Unfortunately the destruction path for psock includes the ULP unwind, so we need to fail the sk_psock_init() itself. Otherwise if ULP is already present we'll notice that later, and call tcp_update_ulp() with the sk_proto of the ULP itself, which will most likely result in the ULP looping its callbacks. Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") Signed-off-by: Jakub Kicinski Reviewed-by: John Fastabend Reviewed-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220620191353.1184629-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/net/inet_sock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index c1b5dcd6597c..daead5fb389a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,6 +253,11 @@ struct inet_sock { #define IP_CMSG_CHECKSUM BIT(7) #define IP_CMSG_RECVFRAGSIZE BIT(8) +static inline bool sk_is_inet(struct sock *sk) +{ + return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; +} + /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket -- cgit v1.2.3 From e34b9ed96ce3b06c79bf884009b16961ca478f87 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 22 Jun 2022 16:43:57 +0200 Subject: netfilter: nf_tables: avoid skb access on nf_stolen When verdict is NF_STOLEN, the skb might have been freed. When tracing is enabled, this can result in a use-after-free: 1. access to skb->nf_trace 2. access to skb->mark 3. computation of trace id 4. dump of packet payload To avoid 1, keep a cached copy of skb->nf_trace in the trace state struct. Refresh this copy whenever verdict is != STOLEN. Avoid 2 by skipping skb->mark access if verdict is STOLEN. 3 is avoided by precomputing the trace id. Only dump the packet when verdict is not "STOLEN". Reported-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 279ae0fff7ad..5c4e5a96a984 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1338,24 +1338,28 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * + * @trace: other struct members are initialised + * @nf_trace: copy of skb->nf_trace before rule evaluation + * @type: event type (enum nft_trace_types) + * @skbid: hash of skb to be used as trace id + * @packet_dumped: packet headers sent in a previous traceinfo message * @pkt: pktinfo currently processed * @basechain: base chain currently processed * @chain: chain currently processed * @rule: rule that was evaluated * @verdict: verdict given by rule - * @type: event type (enum nft_trace_types) - * @packet_dumped: packet headers sent in a previous traceinfo message - * @trace: other struct members are initialised */ struct nft_traceinfo { + bool trace; + bool nf_trace; + bool packet_dumped; + enum nft_trace_types type:8; + u32 skbid; const struct nft_pktinfo *pkt; const struct nft_base_chain *basechain; const struct nft_chain *chain; const struct nft_rule_dp *rule; const struct nft_verdict *verdict; - enum nft_trace_types type; - bool packet_dumped; - bool trace; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, -- cgit v1.2.3 From 052f744f44462cc49b88a125b0f7b93a9e47a9dd Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 4 Jul 2022 22:44:04 +0200 Subject: net/sched: act_police: allow 'continue' action offload Offloading police with action TC_ACT_UNSPEC was erroneously disabled even though it was supported by mlx5 matchall offload implementation, which didn't verify the action type but instead assumed that any single police action attached to matchall classifier is a 'continue' action. Lack of action type check made it non-obvious what mlx5 matchall implementation actually supports and caused implementers and reviewers of referenced commits to disallow it as a part of improved validation code. Fixes: b8cd5831c61c ("net: flow_offload: add tc police action parameters") Fixes: b50e462bc22d ("net/sched: act_police: Add extack messages for offload failure") Signed-off-by: Vlad Buslov Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/flow_offload.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 6484095a8c01..7ac313858037 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -152,6 +152,7 @@ enum flow_action_id { FLOW_ACTION_PIPE, FLOW_ACTION_VLAN_PUSH_ETH, FLOW_ACTION_VLAN_POP_ETH, + FLOW_ACTION_CONTINUE, NUM_FLOW_ACTIONS, }; -- cgit v1.2.3