From 36a686c0784fcccdaa4f38b498a9ef0d42ea7cb8 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 10 Jul 2025 18:43:42 +0200 Subject: Revert "netfilter: nf_tables: Add notifications for hook changes" This reverts commit 465b9ee0ee7bc268d7f261356afd6c4262e48d82. Such notifications fit better into core or nfnetlink_hook code, following the NFNL_MSG_HOOK_GET message format. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 ----- include/uapi/linux/netfilter/nf_tables.h | 10 ---------- include/uapi/linux/netfilter/nfnetlink.h | 2 -- 3 files changed, 17 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e4d8e451e935..5e49619ae49c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,11 +1142,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); -struct nft_hook; -void nf_tables_chain_device_notify(const struct nft_chain *chain, - const struct nft_hook *hook, - const struct net_device *dev, int event); - enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 518ba144544c..2beb30be2c5f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,8 +142,6 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_GETSETELEM_RESET, - NFT_MSG_NEWDEV, - NFT_MSG_DELDEV, NFT_MSG_MAX, }; @@ -1786,18 +1784,10 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) - * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) - * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) - * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) - * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, - NFTA_DEVICE_TABLE, - NFTA_DEVICE_FLOWTABLE, - NFTA_DEVICE_CHAIN, - NFTA_DEVICE_SPEC, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 50d807af2649..6cd58cd2a6f0 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,8 +25,6 @@ enum nfnetlink_groups { #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA NFNLGRP_NFTRACE, #define NFNLGRP_NFTRACE NFNLGRP_NFTRACE - NFNLGRP_NFT_DEV, -#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) -- cgit v1.2.3 From 444020f4bf06fb86805ee7e7ceec0375485fd94d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 14 Jul 2025 14:21:30 +0200 Subject: wifi: cfg80211: remove scan request n_channels counted_by This reverts commit e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by"). This really has been a completely failed experiment. There were no actual bugs found, and yet at this point we already have four "fixes" to it, with nothing to show for but code churn, and it never even made the code any safer. In all of the cases that ended up getting "fixed", the structure is also internally inconsistent after the n_channels setting as the channel list isn't actually filled yet. You cannot scan with such a structure, that's just wrong. In mac80211, the struct is also reused multiple times, so initializing it once is no good. Some previous "fixes" (e.g. one in brcm80211) are also just setting n_channels before accessing the array, under the assumption that the code is correct and the array can be accessed, further showing that the whole thing is just pointless when the allocation count and use count are not separate. If we really wanted to fix it, we'd need to separately track the number of channels allocated and the number of channels currently used, but given that no bugs were found despite the numerous syzbot reports, that'd just be a waste of time. Remove the __counted_by() annotation. We really should also remove a number of the n_channels settings that are setting up a structure that's inconsistent, but that can wait. Reported-by: syzbot+e834e757bd9b3d3e1251@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=e834e757bd9b3d3e1251 Fixes: e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by") Link: https://patch.msgid.link/20250714142130.9b0bbb7e1f07.I09112ccde72d445e11348fc2bef68942cb2ffc94@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d1848dc8ec99..10248d527616 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2690,7 +2690,7 @@ struct cfg80211_scan_request { s8 tsf_report_link_id; /* keep last */ - struct ieee80211_channel *channels[] __counted_by(n_channels); + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) -- cgit v1.2.3 From dfef8d87a031ac1a46dde3de804e0fcf3c3a6afd Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 14 Jul 2025 22:27:43 +0200 Subject: Bluetooth: hci_core: fix typos in macros The provided macro parameter is named 'dev' (rather than 'hdev', which may be a variable on the stack where the macro is used). Fixes: a9a830a676a9 ("Bluetooth: hci_event: Fix sending HCI_OP_READ_ENC_KEY_SIZE") Fixes: 6126ffabba6b ("Bluetooth: Introduce HCI_CONN_FLAG_DEVICE_PRIVACY device flag") Signed-off-by: Christian Eggers Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0da011fc8146..052c91613bb9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1940,11 +1940,11 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ - (hdev->commands[39] & 0x04)) + ((dev)->commands[39] & 0x04)) #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) + !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks)) #define read_voice_setting_capable(dev) \ ((dev)->commands[9] & 0x04 && \ -- cgit v1.2.3 From cdee6a4416b2a57c89082929cc60e2275bb32a3a Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 14 Jul 2025 22:27:44 +0200 Subject: Bluetooth: hci_core: add missing braces when using macro parameters Macro parameters should always be put into braces when accessing it. Fixes: 4fc9857ab8c6 ("Bluetooth: hci_sync: Add check simultaneous roles support") Signed-off-by: Christian Eggers Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 052c91613bb9..367ca43f45d1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -829,20 +829,20 @@ extern struct mutex hci_cb_list_lock; #define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags) -#define hci_dev_clear_volatile_flags(hdev) \ - do { \ - hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ - hci_dev_clear_flag(hdev, HCI_LE_ADV); \ - hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ - hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ - hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ +#define hci_dev_clear_volatile_flags(hdev) \ + do { \ + hci_dev_clear_flag((hdev), HCI_LE_SCAN); \ + hci_dev_clear_flag((hdev), HCI_LE_ADV); \ + hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION); \ + hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ); \ + hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT); \ } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ - (hdev->le_states[4] & 0x08) && /* Central */ \ - (hdev->le_states[4] & 0x40) && /* Peripheral */ \ - (hdev->le_states[3] & 0x10)) /* Simultaneous */ + (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &(hdev)->quirks) && \ + ((hdev)->le_states[4] & 0x08) && /* Central */ \ + ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ + ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); -- cgit v1.2.3 From 6851a0c228fc040dce8e4c393004209e7372e0a3 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 14 Jul 2025 22:27:45 +0200 Subject: Bluetooth: hci_dev: replace 'quirks' integer by 'quirk_flags' bitmap The 'quirks' member already ran out of bits on some platforms some time ago. Replace the integer member by a bitmap in order to have enough bits in future. Replace raw bit operations by accessor macros. Fixes: ff26b2dd6568 ("Bluetooth: Add quirk for broken READ_VOICE_SETTING") Fixes: 127881334eaa ("Bluetooth: Add quirk for broken READ_PAGE_SCAN_TYPE") Suggested-by: Pauli Virtanen Tested-by: Ivan Pravdin Signed-off-by: Kiran K Signed-off-by: Christian Eggers Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 2 ++ include/net/bluetooth/hci_core.h | 28 ++++++++++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 82cbd54443ac..c79901f2dc2a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -377,6 +377,8 @@ enum { * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, + + __HCI_NUM_QUIRKS, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 367ca43f45d1..f79f59e67114 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -464,7 +464,7 @@ struct hci_dev { unsigned int auto_accept_delay; - unsigned long quirks; + DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS); atomic_t cmd_cnt; unsigned int acl_cnt; @@ -656,6 +656,10 @@ struct hci_dev { u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; +#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags) +#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags) +#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags) + #define HCI_PHY_HANDLE(handle) (handle & 0xff) enum conn_reasons { @@ -839,7 +843,7 @@ extern struct mutex hci_cb_list_lock; } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &(hdev)->quirks) && \ + (!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \ ((hdev)->le_states[4] & 0x08) && /* Central */ \ ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ @@ -1931,8 +1935,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ - !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_LE_CODED)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) @@ -1944,27 +1948,27 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE)) #define read_voice_setting_capable(dev) \ ((dev)->commands[9] & 0x04 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING)) /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ #define enhanced_sync_conn_capable(dev) \ (((dev)->commands[29] & 0x08) && \ - !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN)) /* Use ext create connection if command is supported */ #define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1979,8 +1983,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); */ #define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ ext_adv_capable(dev)) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) @@ -1997,7 +2001,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ - (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 -- cgit v1.2.3 From 2d72afb340657f03f7261e9243b44457a9228ac7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Jul 2025 20:39:14 +0200 Subject: netfilter: nf_conntrack: fix crash due to removal of uninitialised entry A crash in conntrack was reported while trying to unlink the conntrack entry from the hash bucket list: [exception RIP: __nf_ct_delete_from_lists+172] [..] #7 [ff539b5a2b043aa0] nf_ct_delete at ffffffffc124d421 [nf_conntrack] #8 [ff539b5a2b043ad0] nf_ct_gc_expired at ffffffffc124d999 [nf_conntrack] #9 [ff539b5a2b043ae0] __nf_conntrack_find_get at ffffffffc124efbc [nf_conntrack] [..] The nf_conn struct is marked as allocated from slab but appears to be in a partially initialised state: ct hlist pointer is garbage; looks like the ct hash value (hence crash). ct->status is equal to IPS_CONFIRMED|IPS_DYING, which is expected ct->timeout is 30000 (=30s), which is unexpected. Everything else looks like normal udp conntrack entry. If we ignore ct->status and pretend its 0, the entry matches those that are newly allocated but not yet inserted into the hash: - ct hlist pointers are overloaded and store/cache the raw tuple hash - ct->timeout matches the relative time expected for a new udp flow rather than the absolute 'jiffies' value. If it were not for the presence of IPS_CONFIRMED, __nf_conntrack_find_get() would have skipped the entry. Theory is that we did hit following race: cpu x cpu y cpu z found entry E found entry E E is expired nf_ct_delete() return E to rcu slab init_conntrack E is re-inited, ct->status set to 0 reply tuplehash hnnode.pprev stores hash value. cpu y found E right before it was deleted on cpu x. E is now re-inited on cpu z. cpu y was preempted before checking for expiry and/or confirm bit. ->refcnt set to 1 E now owned by skb ->timeout set to 30000 If cpu y were to resume now, it would observe E as expired but would skip E due to missing CONFIRMED bit. nf_conntrack_confirm gets called sets: ct->status |= CONFIRMED This is wrong: E is not yet added to hashtable. cpu y resumes, it observes E as expired but CONFIRMED: nf_ct_expired() -> yes (ct->timeout is 30s) confirmed bit set. cpu y will try to delete E from the hashtable: nf_ct_delete() -> set DYING bit __nf_ct_delete_from_lists Even this scenario doesn't guarantee a crash: cpu z still holds the table bucket lock(s) so y blocks: wait for spinlock held by z CONFIRMED is set but there is no guarantee ct will be added to hash: "chaintoolong" or "clash resolution" logic both skip the insert step. reply hnnode.pprev still stores the hash value. unlocks spinlock return NF_DROP In case CPU z does insert the entry into the hashtable, cpu y will unlink E again right away but no crash occurs. Without 'cpu y' race, 'garbage' hlist is of no consequence: ct refcnt remains at 1, eventually skb will be free'd and E gets destroyed via: nf_conntrack_put -> nf_conntrack_destroy -> nf_ct_destroy. To resolve this, move the IPS_CONFIRMED assignment after the table insertion but before the unlock. Pablo points out that the confirm-bit-store could be reordered to happen before hlist add resp. the timeout fixup, so switch to set_bit and before_atomic memory barrier to prevent this. It doesn't matter if other CPUs can observe a newly inserted entry right before the CONFIRMED bit was set: Such event cannot be distinguished from above "E is the old incarnation" case: the entry will be skipped. Also change nf_ct_should_gc() to first check the confirmed bit. The gc sequence is: 1. Check if entry has expired, if not skip to next entry 2. Obtain a reference to the expired entry. 3. Call nf_ct_should_gc() to double-check step 1. nf_ct_should_gc() is thus called only for entries that already failed an expiry check. After this patch, once the confirmed bit check passes ct->timeout has been altered to reflect the absolute 'best before' date instead of a relative time. Step 3 will therefore not remove the entry. Without this change to nf_ct_should_gc() we could still get this sequence: 1. Check if entry has expired. 2. Obtain a reference. 3. Call nf_ct_should_gc() to double-check step 1: 4 - entry is still observed as expired 5 - meanwhile, ct->timeout is corrected to absolute value on other CPU and confirm bit gets set 6 - confirm bit is seen 7 - valid entry is removed again First do check 6), then 4) so the gc expiry check always picks up either confirmed bit unset (entry gets skipped) or expiry re-check failure for re-inited conntrack objects. This change cannot be backported to releases before 5.19. Without commit 8a75a2c17410 ("netfilter: conntrack: remove unconfirmed list") |= IPS_CONFIRMED line cannot be moved without further changes. Cc: Razvan Cojocaru Link: https://lore.kernel.org/netfilter-devel/20250627142758.25664-1-fw@strlen.de/ Link: https://lore.kernel.org/netfilter-devel/4239da15-83ff-4ca4-939d-faef283471bb@gmail.com/ Fixes: 1397af5bfd7d ("netfilter: conntrack: remove the percpu dying list") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3f02a45773e8..ca26274196b9 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -306,8 +306,19 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct) /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { - return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && - !nf_ct_is_dying(ct); + if (!nf_ct_is_confirmed(ct)) + return false; + + /* load ct->timeout after is_confirmed() test. + * Pairs with __nf_conntrack_confirm() which: + * 1. Increases ct->timeout value + * 2. Inserts ct into rcu hlist + * 3. Sets the confirmed bit + * 4. Unlocks the hlist lock + */ + smp_acquire__after_ctrl_dep(); + + return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) -- cgit v1.2.3 From 962fb1f651c2cf2083e0c3ef53ba69e3b96d3fbc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jul 2025 08:43:42 +0100 Subject: rxrpc: Fix recv-recv race of completed call If a call receives an event (such as incoming data), the call gets placed on the socket's queue and a thread in recvmsg can be awakened to go and process it. Once the thread has picked up the call off of the queue, further events will cause it to be requeued, and once the socket lock is dropped (recvmsg uses call->user_mutex to allow the socket to be used in parallel), a second thread can come in and its recvmsg can pop the call off the socket queue again. In such a case, the first thread will be receiving stuff from the call and the second thread will be blocked on call->user_mutex. The first thread can, at this point, process both the event that it picked call for and the event that the second thread picked the call for and may see the call terminate - in which case the call will be "released", decoupling the call from the user call ID assigned to it (RXRPC_USER_CALL_ID in the control message). The first thread will return okay, but then the second thread will wake up holding the user_mutex and, if it sees that the call has been released by the first thread, it will BUG thusly: kernel BUG at net/rxrpc/recvmsg.c:474! Fix this by just dequeuing the call and ignoring it if it is seen to be already released. We can't tell userspace about it anyway as the user call ID has become stale. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: David Howells Reviewed-by: Jeffrey Altman cc: LePremierHomme cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250717074350.3767366-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 378d2dfc7392..e7dcfb1369b6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -330,12 +330,15 @@ EM(rxrpc_call_put_userid, "PUT user-id ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ + EM(rxrpc_call_see_already_released, "SEE alrdy-rl") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE connect ") \ EM(rxrpc_call_see_conn_abort, "SEE conn-abt") \ + EM(rxrpc_call_see_discard, "SEE discard ") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ + EM(rxrpc_call_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ -- cgit v1.2.3 From 2fd895842d49c23137ae48252dd211e5d6d8a3ed Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jul 2025 08:43:43 +0100 Subject: rxrpc: Fix notification vs call-release vs recvmsg When a call is released, rxrpc takes the spinlock and removes it from ->recvmsg_q in an effort to prevent racing recvmsg() invocations from seeing the same call. Now, rxrpc_recvmsg() only takes the spinlock when actually removing a call from the queue; it doesn't, however, take it in the lead up to that when it checks to see if the queue is empty. It *does* hold the socket lock, which prevents a recvmsg/recvmsg race - but this doesn't prevent sendmsg from ending the call because sendmsg() drops the socket lock and relies on the call->user_mutex. Fix this by firstly removing the bit in rxrpc_release_call() that dequeues the released call and, instead, rely on recvmsg() to simply discard released calls (done in a preceding fix). Secondly, rxrpc_notify_socket() is abandoned if the call is already marked as released rather than trying to be clever by setting both pointers in call->recvmsg_link to NULL to trick list_empty(). This isn't perfect and can still race, resulting in a released call on the queue, but recvmsg() will now clean that up. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells Reviewed-by: Jeffrey Altman cc: Marc Dionne cc: Junvyyang, Tencent Zhuque Lab cc: LePremierHomme cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250717074350.3767366-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e7dcfb1369b6..de6f6d25767c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -322,10 +322,10 @@ EM(rxrpc_call_put_kernel, "PUT kernel ") \ EM(rxrpc_call_put_poke, "PUT poke ") \ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ + EM(rxrpc_call_put_release_recvmsg_q, "PUT rls-rcmq") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ - EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ EM(rxrpc_call_put_userid, "PUT user-id ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ @@ -338,6 +338,7 @@ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ + EM(rxrpc_call_see_notify_released, "SEE nfy-rlsd") \ EM(rxrpc_call_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ -- cgit v1.2.3