From 234249d88b091d006b82f8d570343aae5f383736 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Fri, 25 Aug 2023 03:00:55 -0400 Subject: wifi: cfg80211/mac80211: hold link BSSes when assoc fails for MLO connection When connect to MLO AP with more than one link, and the assoc response of AP is not success, then cfg80211_unhold_bss() is not called for all the links' cfg80211_bss except the primary link which means the link used by the latest successful association request. Thus the hold value of the cfg80211_bss is not reset to 0 after the assoc fail, and then the __cfg80211_unlink_bss() will not be called for the cfg80211_bss by __cfg80211_bss_expire(). Then the AP always looks exist even the AP is shutdown or reconfigured to another type, then it will lead error while connecting it again. The detail info are as below. When connect with muti-links AP, cfg80211_hold_bss() is called by cfg80211_mlme_assoc() for each cfg80211_bss of all the links. When assoc response from AP is not success(such as status_code==1), the ieee80211_link_data of non-primary link(sdata->link[link_id]) is NULL because ieee80211_assoc_success()->ieee80211_vif_update_links() is not called for the links. Then struct cfg80211_rx_assoc_resp resp in cfg80211_rx_assoc_resp() and struct cfg80211_connect_resp_params cr in __cfg80211_connect_result() will only have the data of the primary link, and finally function cfg80211_connect_result_release_bsses() only call cfg80211_unhold_bss() for the primary link. Then cfg80211_bss of the other links will never free because its hold is always > 0 now. Hence assign value for the bss and status from assoc_data since it is valid for this case. Also assign value of addr from assoc_data when the link is NULL because the addrs of assoc_data and link both represent the local link addr and they are same value for success connection. Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20230825070055.28164-1-quic_wgong@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 11 ++++++----- net/wireless/mlme.c | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f93eb38ae0b8..46d46cfab6c8 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5429,17 +5429,18 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; - link = sdata_dereference(sdata->link[link_id], sdata); - if (!link) - continue; - if (!assoc_data->link[link_id].bss) continue; resp.links[link_id].bss = assoc_data->link[link_id].bss; - resp.links[link_id].addr = link->conf->addr; + ether_addr_copy(resp.links[link_id].addr, + assoc_data->link[link_id].addr); resp.links[link_id].status = assoc_data->link[link_id].status; + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 775cac4d6100..3e2c398abddc 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -52,7 +52,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, cr.links[link_id].bssid = data->links[link_id].bss->bssid; cr.links[link_id].addr = data->links[link_id].addr; /* need to have local link addresses for MLO connections */ - WARN_ON(cr.ap_mld_addr && !cr.links[link_id].addr); + WARN_ON(cr.ap_mld_addr && + !is_valid_ether_addr(cr.links[link_id].addr)); BUG_ON(!cr.links[link_id].bss->channel); -- cgit v1.2.3 From 5112fa502708aaaf80acb78273fc8625f221eb11 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 5 Sep 2023 12:18:57 +0530 Subject: wifi: cfg80211: validate AP phy operation before starting it Many regulatories can have HE/EHT Operation as not permitted. In such cases, AP should not be allowed to start if it is using a channel having the no operation flag set. However, currently there is no such check in place. Fix this issue by validating such IEs sent during start AP against the channel flags. Signed-off-by: Aditya Kumar Singh Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230905064857.1503-1-quic_adisi@quicinc.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de47838aca4f..0c989a839e56 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5909,6 +5909,21 @@ out: nlmsg_free(msg); } +static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params) +{ + struct ieee80211_channel *channel = params->chandef.chan; + + if ((params->he_cap || params->he_oper) && + (channel->flags & IEEE80211_CHAN_NO_HE)) + return -EOPNOTSUPP; + + if ((params->eht_cap || params->eht_oper) && + (channel->flags & IEEE80211_CHAN_NO_EHT)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6178,6 +6193,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (err) goto out_unlock; + err = nl80211_validate_ap_phy_operation(params); + if (err) + goto out_unlock; + if (info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]) params->flags = nla_get_u32( info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]); -- cgit v1.2.3 From 37c20b2effe987b806c8de6d12978e4ffeff026f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Aug 2023 15:38:04 +0200 Subject: wifi: cfg80211: fix cqm_config access race Max Schulze reports crashes with brcmfmac. The reason seems to be a race between userspace removing the CQM config and the driver calling cfg80211_cqm_rssi_notify(), where if the data is freed while cfg80211_cqm_rssi_notify() runs it will crash since it assumes wdev->cqm_config is set. This can't be fixed with a simple non-NULL check since there's nothing we can do for locking easily, so use RCU instead to protect the pointer, but that requires pulling the updates out into an asynchronous worker so they can sleep and call back into the driver. Since we need to change the free anyway, also change it to go back to the old settings if changing the settings fails. Reported-and-tested-by: Max Schulze Closes: https://lore.kernel.org/r/ac96309a-8d8d-4435-36e6-6d152eb31876@online.de Fixes: 4a4b8169501b ("cfg80211: Accept multiple RSSI thresholds for CQM") Signed-off-by: Johannes Berg --- net/wireless/core.c | 14 ++++---- net/wireless/core.h | 7 ++-- net/wireless/nl80211.c | 93 +++++++++++++++++++++++++++++++++----------------- 3 files changed, 73 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 25bc2e50a061..64e861617110 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1181,16 +1181,11 @@ void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); -void cfg80211_cqm_config_free(struct wireless_dev *wdev) -{ - kfree(wdev->cqm_config); - wdev->cqm_config = NULL; -} - static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, bool unregister_netdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_cqm_config *cqm_config; unsigned int link_id; ASSERT_RTNL(); @@ -1227,7 +1222,10 @@ static void _cfg80211_unregister_wdev(struct wireless_dev *wdev, kfree_sensitive(wdev->wext.keys); wdev->wext.keys = NULL; #endif - cfg80211_cqm_config_free(wdev); + wiphy_work_cancel(wdev->wiphy, &wdev->cqm_rssi_work); + /* deleted from the list, so can't be found from nl80211 any more */ + cqm_config = rcu_access_pointer(wdev->cqm_config); + kfree_rcu(cqm_config, rcu_head); /* * Ensure that all events have been processed and @@ -1379,6 +1377,8 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; #endif + wiphy_work_init(&wdev->cqm_rssi_work, cfg80211_cqm_rssi_notify_work); + if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT) wdev->ps = true; else diff --git a/net/wireless/core.h b/net/wireless/core.h index 507d184b8b40..ba9c7170afa4 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -295,12 +295,17 @@ struct cfg80211_beacon_registration { }; struct cfg80211_cqm_config { + struct rcu_head rcu_head; u32 rssi_hyst; s32 last_rssi_event_value; + enum nl80211_cqm_rssi_threshold_event last_rssi_event_type; int n_rssi_thresholds; s32 rssi_thresholds[] __counted_by(n_rssi_thresholds); }; +void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, + struct wiphy_work *work); + void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); /* free object */ @@ -566,8 +571,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, #define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; }) #endif -void cfg80211_cqm_config_free(struct wireless_dev *wdev); - void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid); void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev); void cfg80211_pmsr_free_wk(struct work_struct *work); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0c989a839e56..7a88361b3414 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12815,7 +12815,8 @@ static int nl80211_set_cqm_txe(struct genl_info *info, } static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, - struct net_device *dev) + struct net_device *dev, + struct cfg80211_cqm_config *cqm_config) { struct wireless_dev *wdev = dev->ieee80211_ptr; s32 last, low, high; @@ -12824,7 +12825,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, int err; /* RSSI reporting disabled? */ - if (!wdev->cqm_config) + if (!cqm_config) return rdev_set_cqm_rssi_range_config(rdev, dev, 0, 0); /* @@ -12833,7 +12834,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, * connection is established and enough beacons received to calculate * the average. */ - if (!wdev->cqm_config->last_rssi_event_value && + if (!cqm_config->last_rssi_event_value && wdev->links[0].client.current_bss && rdev->ops->get_station) { struct station_info sinfo = {}; @@ -12847,30 +12848,30 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, cfg80211_sinfo_release_content(&sinfo); if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) - wdev->cqm_config->last_rssi_event_value = + cqm_config->last_rssi_event_value = (s8) sinfo.rx_beacon_signal_avg; } - last = wdev->cqm_config->last_rssi_event_value; - hyst = wdev->cqm_config->rssi_hyst; - n = wdev->cqm_config->n_rssi_thresholds; + last = cqm_config->last_rssi_event_value; + hyst = cqm_config->rssi_hyst; + n = cqm_config->n_rssi_thresholds; for (i = 0; i < n; i++) { i = array_index_nospec(i, n); - if (last < wdev->cqm_config->rssi_thresholds[i]) + if (last < cqm_config->rssi_thresholds[i]) break; } low_index = i - 1; if (low_index >= 0) { low_index = array_index_nospec(low_index, n); - low = wdev->cqm_config->rssi_thresholds[low_index] - hyst; + low = cqm_config->rssi_thresholds[low_index] - hyst; } else { low = S32_MIN; } if (i < n) { i = array_index_nospec(i, n); - high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1; + high = cqm_config->rssi_thresholds[i] + hyst - 1; } else { high = S32_MAX; } @@ -12883,6 +12884,7 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, u32 hysteresis) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_cqm_config *cqm_config = NULL, *old; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; int i, err; @@ -12900,10 +12902,6 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - wdev_lock(wdev); - cfg80211_cqm_config_free(wdev); - wdev_unlock(wdev); - if (n_thresholds <= 1 && rdev->ops->set_cqm_rssi_config) { if (n_thresholds == 0 || thresholds[0] == 0) /* Disabling */ return rdev_set_cqm_rssi_config(rdev, dev, 0, 0); @@ -12920,9 +12918,10 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, n_thresholds = 0; wdev_lock(wdev); - if (n_thresholds) { - struct cfg80211_cqm_config *cqm_config; + old = rcu_dereference_protected(wdev->cqm_config, + lockdep_is_held(&wdev->mtx)); + if (n_thresholds) { cqm_config = kzalloc(struct_size(cqm_config, rssi_thresholds, n_thresholds), GFP_KERNEL); @@ -12937,11 +12936,18 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, flex_array_size(cqm_config, rssi_thresholds, n_thresholds)); - wdev->cqm_config = cqm_config; + rcu_assign_pointer(wdev->cqm_config, cqm_config); + } else { + RCU_INIT_POINTER(wdev->cqm_config, NULL); } - err = cfg80211_cqm_rssi_update(rdev, dev); - + err = cfg80211_cqm_rssi_update(rdev, dev, cqm_config); + if (err) { + rcu_assign_pointer(wdev->cqm_config, old); + kfree_rcu(cqm_config, rcu_head); + } else { + kfree_rcu(old, rcu_head); + } unlock: wdev_unlock(wdev); @@ -19092,9 +19098,8 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, enum nl80211_cqm_rssi_threshold_event rssi_event, s32 rssi_level, gfp_t gfp) { - struct sk_buff *msg; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_cqm_config *cqm_config; trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level); @@ -19102,18 +19107,41 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) return; - if (wdev->cqm_config) { - wdev->cqm_config->last_rssi_event_value = rssi_level; + rcu_read_lock(); + cqm_config = rcu_dereference(wdev->cqm_config); + if (cqm_config) { + cqm_config->last_rssi_event_value = rssi_level; + cqm_config->last_rssi_event_type = rssi_event; + wiphy_work_queue(wdev->wiphy, &wdev->cqm_rssi_work); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(cfg80211_cqm_rssi_notify); + +void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, struct wiphy_work *work) +{ + struct wireless_dev *wdev = container_of(work, struct wireless_dev, + cqm_rssi_work); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + enum nl80211_cqm_rssi_threshold_event rssi_event; + struct cfg80211_cqm_config *cqm_config; + struct sk_buff *msg; + s32 rssi_level; - cfg80211_cqm_rssi_update(rdev, dev); + wdev_lock(wdev); + cqm_config = rcu_dereference_protected(wdev->cqm_config, + lockdep_is_held(&wdev->mtx)); + if (!wdev->cqm_config) + goto unlock; - if (rssi_level == 0) - rssi_level = wdev->cqm_config->last_rssi_event_value; - } + cfg80211_cqm_rssi_update(rdev, wdev->netdev, cqm_config); - msg = cfg80211_prepare_cqm(dev, NULL, gfp); + rssi_level = cqm_config->last_rssi_event_value; + rssi_event = cqm_config->last_rssi_event_type; + + msg = cfg80211_prepare_cqm(wdev->netdev, NULL, GFP_KERNEL); if (!msg) - return; + goto unlock; if (nla_put_u32(msg, NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT, rssi_event)) @@ -19123,14 +19151,15 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, rssi_level)) goto nla_put_failure; - cfg80211_send_cqm(msg, gfp); + cfg80211_send_cqm(msg, GFP_KERNEL); - return; + goto unlock; nla_put_failure: nlmsg_free(msg); + unlock: + wdev_unlock(wdev); } -EXPORT_SYMBOL(cfg80211_cqm_rssi_notify); void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer, u32 num_packets, -- cgit v1.2.3 From 6e48ebffc2db5419b3a51cfc509bde442252b356 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 13 Sep 2023 07:01:34 +0200 Subject: wifi: mac80211: fix mesh id corruption on 32 bit systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the changed field size was increased to u64, mesh_bss_info_changed pulls invalid bits from the first 3 bytes of the mesh id, clears them, and passes them on to ieee80211_link_info_change_notify, because ifmsh->mbss_changed was not updated to match its size. Fix this by turning into ifmsh->mbss_changed into an unsigned long array with 64 bit size. Fixes: 15ddba5f4311 ("wifi: mac80211: consistently use u64 for BSS changes") Reported-by: Thomas Hühn Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20230913050134.53536-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mesh.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 06bd406846d2..b3d00259e1d6 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -676,7 +676,7 @@ struct ieee80211_if_mesh { struct timer_list mesh_path_root_timer; unsigned long wrkq_flags; - unsigned long mbss_changed; + unsigned long mbss_changed[64 / BITS_PER_LONG]; bool userspace_handles_dfs; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index af8c5fc2db14..e31c312c124a 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1175,7 +1175,7 @@ void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata, /* if we race with running work, worst case this work becomes a noop */ for_each_set_bit(bit, &bits, sizeof(changed) * BITS_PER_BYTE) - set_bit(bit, &ifmsh->mbss_changed); + set_bit(bit, ifmsh->mbss_changed); set_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } @@ -1257,7 +1257,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) /* clear any mesh work (for next join) we may have accrued */ ifmsh->wrkq_flags = 0; - ifmsh->mbss_changed = 0; + memset(ifmsh->mbss_changed, 0, sizeof(ifmsh->mbss_changed)); local->fif_other_bss--; atomic_dec(&local->iff_allmultis); @@ -1724,9 +1724,9 @@ static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata) u32 bit; u64 changed = 0; - for_each_set_bit(bit, &ifmsh->mbss_changed, + for_each_set_bit(bit, ifmsh->mbss_changed, sizeof(changed) * BITS_PER_BYTE) { - clear_bit(bit, &ifmsh->mbss_changed); + clear_bit(bit, ifmsh->mbss_changed); changed |= BIT(bit); } -- cgit v1.2.3 From 2c3dfba4cf84ac4f306cc6653b37b6dd6859ae9d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 14 Sep 2023 15:45:17 +0200 Subject: rfkill: sync before userspace visibility/changes If userspace quickly opens /dev/rfkill after a new instance was created, it might see the old state of the instance from before the sync work runs and may even _change_ the state, only to have the sync work change it again. Fix this by doing the sync inline where needed, not just for /dev/rfkill but also for sysfs. Signed-off-by: Johannes Berg --- net/rfkill/core.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 01fca7a10b4b..08630896b6c8 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -48,6 +48,7 @@ struct rfkill { bool persistent; bool polling_paused; bool suspended; + bool need_sync; const struct rfkill_ops *ops; void *data; @@ -368,6 +369,17 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) rfkill_event(rfkill); } +static void rfkill_sync(struct rfkill *rfkill) +{ + lockdep_assert_held(&rfkill_global_mutex); + + if (!rfkill->need_sync) + return; + + rfkill_set_block(rfkill, rfkill_global_states[rfkill->type].cur); + rfkill->need_sync = false; +} + static void rfkill_update_global_state(enum rfkill_type type, bool blocked) { int i; @@ -730,6 +742,10 @@ static ssize_t soft_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); + mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); + mutex_unlock(&rfkill_global_mutex); + return sysfs_emit(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 1 : 0); } @@ -751,6 +767,7 @@ static ssize_t soft_store(struct device *dev, struct device_attribute *attr, return -EINVAL; mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); rfkill_set_block(rfkill, state); mutex_unlock(&rfkill_global_mutex); @@ -783,6 +800,10 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, { struct rfkill *rfkill = to_rfkill(dev); + mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); + mutex_unlock(&rfkill_global_mutex); + return sysfs_emit(buf, "%d\n", user_state_from_blocked(rfkill->state)); } @@ -805,6 +826,7 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr, return -EINVAL; mutex_lock(&rfkill_global_mutex); + rfkill_sync(rfkill); rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED); mutex_unlock(&rfkill_global_mutex); @@ -1032,14 +1054,10 @@ static void rfkill_uevent_work(struct work_struct *work) static void rfkill_sync_work(struct work_struct *work) { - struct rfkill *rfkill; - bool cur; - - rfkill = container_of(work, struct rfkill, sync_work); + struct rfkill *rfkill = container_of(work, struct rfkill, sync_work); mutex_lock(&rfkill_global_mutex); - cur = rfkill_global_states[rfkill->type].cur; - rfkill_set_block(rfkill, cur); + rfkill_sync(rfkill); mutex_unlock(&rfkill_global_mutex); } @@ -1087,6 +1105,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) round_jiffies_relative(POLL_INTERVAL)); if (!rfkill->persistent || rfkill_epo_lock_active) { + rfkill->need_sync = true; schedule_work(&rfkill->sync_work); } else { #ifdef CONFIG_RFKILL_INPUT @@ -1171,6 +1190,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) goto free; + rfkill_sync(rfkill); rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); list_add_tail(&ev->list, &data->events); } -- cgit v1.2.3 From cbaabbcdcbd355f0a1ccc09a925575c51c270750 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Sat, 26 Aug 2023 16:13:13 +0800 Subject: Bluetooth: Delete unused hci_req_prepare_suspend() declaration hci_req_prepare_suspend() has been deprecated in favor of hci_suspend_sync(). Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Yao Xiao Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_request.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index b9c5a9823837..0be75cf0efed 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -71,7 +71,5 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn); void hci_req_add_le_passive_scan(struct hci_request *req); -void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); - void hci_request_setup(struct hci_dev *hdev); void hci_request_cancel_all(struct hci_dev *hdev); -- cgit v1.2.3 From 941c998b42f5c90384f49da89a6e11233de567cf Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 29 Aug 2023 13:50:06 -0700 Subject: Bluetooth: hci_sync: Fix handling of HCI_QUIRK_STRICT_DUPLICATE_FILTER When HCI_QUIRK_STRICT_DUPLICATE_FILTER is set LE scanning requires periodic restarts of the scanning procedure as the controller would consider device previously found as duplicated despite of RSSI changes, but in order to set the scan timeout properly set le_scan_restart needs to be synchronous so it shall not use hci_cmd_sync_queue which defers the command processing to cmd_sync_work. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-bluetooth/578e6d7afd676129decafba846a933f5@agner.ch/#t Fixes: 27d54b778ad1 ("Bluetooth: Rework le_scan_restart for hci_sync") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 9b93653c6197..fd7c5d902856 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -413,11 +413,6 @@ static int hci_le_scan_restart_sync(struct hci_dev *hdev) LE_SCAN_FILTER_DUP_ENABLE); } -static int le_scan_restart_sync(struct hci_dev *hdev, void *data) -{ - return hci_le_scan_restart_sync(hdev); -} - static void le_scan_restart(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -427,15 +422,15 @@ static void le_scan_restart(struct work_struct *work) bt_dev_dbg(hdev, ""); - hci_dev_lock(hdev); - - status = hci_cmd_sync_queue(hdev, le_scan_restart_sync, NULL, NULL); + status = hci_le_scan_restart_sync(hdev); if (status) { bt_dev_err(hdev, "failed to restart LE scan: status %d", status); - goto unlock; + return; } + hci_dev_lock(hdev); + if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || !hdev->discovery.scan_start) goto unlock; -- cgit v1.2.3 From c7eaf80bfb0c8cef852cce9501b95dd5a6bddcb9 Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Mon, 4 Sep 2023 14:11:51 +0000 Subject: Bluetooth: Fix hci_link_tx_to RCU lock usage Syzbot found a bug "BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580". It is because hci_link_tx_to holds an RCU read lock and calls hci_disconnect which would hold a mutex lock since the commit a13f316e90fd ("Bluetooth: hci_conn: Consolidate code for aborting connections"). Here's an example call trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xfc/0x174 lib/dump_stack.c:106 ___might_sleep+0x4a9/0x4d3 kernel/sched/core.c:9663 __mutex_lock_common kernel/locking/mutex.c:576 [inline] __mutex_lock+0xc7/0x6e7 kernel/locking/mutex.c:732 hci_cmd_sync_queue+0x3a/0x287 net/bluetooth/hci_sync.c:388 hci_abort_conn+0x2cd/0x2e4 net/bluetooth/hci_conn.c:1812 hci_disconnect+0x207/0x237 net/bluetooth/hci_conn.c:244 hci_link_tx_to net/bluetooth/hci_core.c:3254 [inline] __check_timeout net/bluetooth/hci_core.c:3419 [inline] __check_timeout+0x310/0x361 net/bluetooth/hci_core.c:3399 hci_sched_le net/bluetooth/hci_core.c:3602 [inline] hci_tx_work+0xe8f/0x12d0 net/bluetooth/hci_core.c:3652 process_one_work+0x75c/0xba1 kernel/workqueue.c:2310 worker_thread+0x5b2/0x73a kernel/workqueue.c:2457 kthread+0x2f7/0x30b kernel/kthread.c:319 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:298 This patch releases RCU read lock before calling hci_disconnect and reacquires it afterward to fix the bug. Fixes: a13f316e90fd ("Bluetooth: hci_conn: Consolidate code for aborting connections") Signed-off-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index a5992f1b3c9b..db4f28d68d71 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3418,7 +3418,12 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); + /* hci_disconnect might sleep, so, we have to release + * the RCU read lock before calling it. + */ + rcu_read_unlock(); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); + rcu_read_lock(); } } -- cgit v1.2.3 From e0275ea52169412b8faccb4e2f4fed8a057844c6 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 28 Aug 2023 13:05:45 -0700 Subject: Bluetooth: ISO: Fix handling of listen for unicast iso_listen_cis shall only return -EADDRINUSE if the listening socket has the destination set to BDADDR_ANY otherwise if the destination is set to a specific address it is for broadcast which shall be ignored. Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 16da946f5881..71248163ce9a 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -502,7 +502,7 @@ drop: } /* -------- Socket interface ---------- */ -static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *ba) +static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst) { struct sock *sk; @@ -510,7 +510,10 @@ static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *ba) if (sk->sk_state != BT_LISTEN) continue; - if (!bacmp(&iso_pi(sk)->src, ba)) + if (bacmp(&iso_pi(sk)->dst, dst)) + continue; + + if (!bacmp(&iso_pi(sk)->src, src)) return sk; } @@ -952,7 +955,7 @@ static int iso_listen_cis(struct sock *sk) write_lock(&iso_sk_list.lock); - if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src)) + if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); -- cgit v1.2.3 From 1d8e801422d66e4b8c7b187c52196bef94eed887 Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Thu, 7 Sep 2023 04:39:34 +0000 Subject: Bluetooth: Avoid redundant authentication While executing the Android 13 CTS Verifier Secure Server test on a ChromeOS device, it was observed that the Bluetooth host initiates authentication for an RFCOMM connection after SSP completes. When this happens, some Intel Bluetooth controllers, like AC9560, would disconnect with "Connection Rejected due to Security Reasons (0x0e)". Historically, BlueZ did not mandate this authentication while an authenticated combination key was already in use for the connection. This behavior was changed since commit 7b5a9241b780 ("Bluetooth: Introduce requirements for security level 4"). So, this patch addresses the aforementioned disconnection issue by restoring the previous behavior. Signed-off-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 63 +++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 9d5057cef30a..7a6f20338db8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2413,34 +2413,41 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; - /* An authenticated FIPS approved combination key has sufficient - * security for security level 4. */ - if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 && - sec_level == BT_SECURITY_FIPS) - goto encrypt; - - /* An authenticated combination key has sufficient security for - security level 3. */ - if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 || - conn->key_type == HCI_LK_AUTH_COMBINATION_P256) && - sec_level == BT_SECURITY_HIGH) - goto encrypt; - - /* An unauthenticated combination key has sufficient security for - security level 1 and 2. */ - if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 || - conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) && - (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW)) - goto encrypt; - - /* A combination key has always sufficient security for the security - levels 1 or 2. High security level requires the combination key - is generated using maximum PIN code length (16). - For pre 2.1 units. */ - if (conn->key_type == HCI_LK_COMBINATION && - (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW || - conn->pin_length == 16)) - goto encrypt; + switch (conn->key_type) { + case HCI_LK_AUTH_COMBINATION_P256: + /* An authenticated FIPS approved combination key has + * sufficient security for security level 4 or lower. + */ + if (sec_level <= BT_SECURITY_FIPS) + goto encrypt; + break; + case HCI_LK_AUTH_COMBINATION_P192: + /* An authenticated combination key has sufficient security for + * security level 3 or lower. + */ + if (sec_level <= BT_SECURITY_HIGH) + goto encrypt; + break; + case HCI_LK_UNAUTH_COMBINATION_P192: + case HCI_LK_UNAUTH_COMBINATION_P256: + /* An unauthenticated combination key has sufficient security + * for security level 2 or lower. + */ + if (sec_level <= BT_SECURITY_MEDIUM) + goto encrypt; + break; + case HCI_LK_COMBINATION: + /* A combination key has always sufficient security for the + * security levels 2 or lower. High security level requires the + * combination key is generated using maximum PIN code length + * (16). For pre 2.1 units. + */ + if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16) + goto encrypt; + break; + default: + break; + } auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) -- cgit v1.2.3 From dcda165706b9fbfd685898d46a6749d7d397e0c0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 15 Sep 2023 14:42:27 -0700 Subject: Bluetooth: hci_core: Fix build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following warnings: net/bluetooth/hci_core.c: In function ‘hci_register_dev’: net/bluetooth/hci_core.c:2620:54: warning: ‘%d’ directive output may be truncated writing between 1 and 10 bytes into a region of size 5 [-Wformat-truncation=] 2620 | snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); | ^~ net/bluetooth/hci_core.c:2620:50: note: directive argument in the range [0, 2147483647] 2620 | snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); | ^~~~~~~ net/bluetooth/hci_core.c:2620:9: note: ‘snprintf’ output between 5 and 14 bytes into a destination of size 8 2620 | snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index db4f28d68d71..9e89843c259b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2617,7 +2617,11 @@ int hci_register_dev(struct hci_dev *hdev) if (id < 0) return id; - snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); + error = dev_set_name(&hdev->dev, "hci%u", id); + if (error) + return error; + + hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); @@ -2639,8 +2643,6 @@ int hci_register_dev(struct hci_dev *hdev) if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); - dev_set_name(&hdev->dev, "%s", hdev->name); - error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; -- cgit v1.2.3 From b938790e70540bf4f2e653dcd74b232494d06c8f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 15 Sep 2023 13:24:47 -0700 Subject: Bluetooth: hci_codec: Fix leaking content of local_codecs The following memory leak can be observed when the controller supports codecs which are stored in local_codecs list but the elements are never freed: unreferenced object 0xffff88800221d840 (size 32): comm "kworker/u3:0", pid 36, jiffies 4294898739 (age 127.060s) hex dump (first 32 bytes): f8 d3 02 03 80 88 ff ff 80 d8 21 02 80 88 ff ff ..........!..... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __kmalloc+0x47/0x120 [] hci_codec_list_add.isra.0+0x2d/0x160 [] hci_read_codec_capabilities+0x183/0x270 [] hci_read_supported_codecs+0x1bb/0x2d0 [] hci_read_local_codecs_sync+0x3e/0x60 [] hci_dev_open_sync+0x943/0x11e0 [] hci_power_on+0x10d/0x3f0 [] process_one_work+0x404/0x800 [] worker_thread+0x374/0x670 [] kthread+0x188/0x1c0 [] ret_from_fork+0x2b/0x50 [] ret_from_fork_asm+0x1a/0x30 Cc: stable@vger.kernel.org Fixes: 8961987f3f5f ("Bluetooth: Enumerate local supported codec and cache details") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 1 + net/bluetooth/hci_event.c | 1 + net/bluetooth/hci_sync.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9e89843c259b..195aea2198a9 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2786,6 +2786,7 @@ void hci_release_dev(struct hci_dev *hdev) hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); + hci_codec_list_clear(&hdev->local_codecs); hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 35f251041eeb..31d02b54eea1 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -33,6 +33,7 @@ #include "hci_request.h" #include "hci_debugfs.h" +#include "hci_codec.h" #include "a2mp.h" #include "amp.h" #include "smp.h" diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index fd7c5d902856..d06e07a0ea5a 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5074,6 +5074,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); + hci_codec_list_clear(&hdev->local_codecs); hci_dev_put(hdev); return err; -- cgit v1.2.3 From 31db78a4923ef5e2008f2eed321811ca79e7f71b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2023 08:34:15 +0200 Subject: wifi: mac80211: fix potential key use-after-free When ieee80211_key_link() is called by ieee80211_gtk_rekey_add() but returns 0 due to KRACK protection (identical key reinstall), ieee80211_gtk_rekey_add() will still return a pointer into the key, in a potential use-after-free. This normally doesn't happen since it's only called by iwlwifi in case of WoWLAN rekey offload which has its own KRACK protection, but still better to fix, do that by returning an error code and converting that to success on the cfg80211 boundary only, leaving the error for bad callers of ieee80211_gtk_rekey_add(). Reported-by: Dan Carpenter Fixes: fdf7cb4185b6 ("mac80211: accept key reinstall without changing anything") Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 +++ net/mac80211/key.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 45e7a5d9c7d9..e883c41a2163 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -566,6 +566,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } err = ieee80211_key_link(key, link, sta); + /* KRACK protection, shouldn't happen but just silently accept key */ + if (err == -EALREADY) + err = 0; out_unlock: mutex_unlock(&local->sta_mtx); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 13050dc9321f..84ba20c3e3dc 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -905,7 +905,7 @@ int ieee80211_key_link(struct ieee80211_key *key, */ if (ieee80211_key_identical(sdata, old_key, key)) { ieee80211_key_free_unused(key); - ret = 0; + ret = -EALREADY; goto out; } -- cgit v1.2.3 From d097ae01ebd48adc028aebcf760117a5317975dc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2023 08:34:16 +0200 Subject: wifi: mac80211: fix potential key leak When returning from ieee80211_key_link(), the key needs to have been freed or successfully installed. This was missed in a number of error paths, fix it. Signed-off-by: Johannes Berg --- net/mac80211/key.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 84ba20c3e3dc..0665ff5e456e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -802,6 +802,9 @@ static void ieee80211_key_destroy(struct ieee80211_key *key, void ieee80211_key_free_unused(struct ieee80211_key *key) { + if (!key) + return; + WARN_ON(key->sdata || key->local); ieee80211_key_free_common(key); } @@ -854,7 +857,7 @@ int ieee80211_key_link(struct ieee80211_key *key, * can cause warnings to appear. */ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; - int ret = -EOPNOTSUPP; + int ret; mutex_lock(&sdata->local->key_mtx); @@ -868,8 +871,10 @@ int ieee80211_key_link(struct ieee80211_key *key, * the same cipher. Enforce the assumption for pairwise keys. */ if ((alt_key && alt_key->conf.cipher != key->conf.cipher) || - (old_key && old_key->conf.cipher != key->conf.cipher)) + (old_key && old_key->conf.cipher != key->conf.cipher)) { + ret = -EOPNOTSUPP; goto out; + } } else if (sta) { struct link_sta_info *link_sta = &sta->deflink; int link_id = key->conf.link_id; @@ -895,8 +900,10 @@ int ieee80211_key_link(struct ieee80211_key *key, /* Non-pairwise keys must also not switch the cipher on rekey */ if (!pairwise) { - if (old_key && old_key->conf.cipher != key->conf.cipher) + if (old_key && old_key->conf.cipher != key->conf.cipher) { + ret = -EOPNOTSUPP; goto out; + } } /* @@ -904,9 +911,8 @@ int ieee80211_key_link(struct ieee80211_key *key, * new version of the key to avoid nonce reuse or replay issues. */ if (ieee80211_key_identical(sdata, old_key, key)) { - ieee80211_key_free_unused(key); ret = -EALREADY; - goto out; + goto unlock; } key->local = sdata->local; @@ -930,7 +936,11 @@ int ieee80211_key_link(struct ieee80211_key *key, ieee80211_key_free(key, delay_tailroom); } + key = NULL; + out: + ieee80211_key_free_unused(key); + unlock: mutex_unlock(&sdata->local->key_mtx); return ret; -- cgit v1.2.3 From 0914468adf92296c4cba8a2134e06e3dea150f2e Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 18 Sep 2023 14:10:54 +0300 Subject: wifi: cfg80211: Fix 6GHz scan configuration When the scan request includes a non broadcast BSSID, when adding the scan parameters for 6GHz collocated scanning, do not include entries that do not match the given BSSID. Signed-off-by: Ilan Peer Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230918140607.6d31d2a96baf.I6c4e3e3075d1d1878ee41f45190fdc6b86f18708@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0cf1ce7b6934..939deecf0bbe 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -908,6 +908,10 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) !cfg80211_find_ssid_match(ap, request)) continue; + if (!is_broadcast_ether_addr(request->bssid) && + !ether_addr_equal(request->bssid, ap->bssid)) + continue; + if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) continue; -- cgit v1.2.3 From 084cf2aeca97566db4fa15d55653c1cba2db83ed Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Sep 2023 14:10:55 +0300 Subject: wifi: mac80211: work around Cisco AP 9115 VHT MPDU length Cisco AP module 9115 with FW 17.3 has a bug and sends a too large maximum MPDU length in the association response (indicating 12k) that it cannot actually process. Work around that by taking the minimum between what's in the association response and the BSS elements (from beacon or probe response). Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230918140607.d1966a9a532e.I090225babb7cd4d1081ee9acd40e7de7e41c15ae@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 ++- net/mac80211/ibss.c | 2 +- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mesh_plink.c | 2 +- net/mac80211/mlme.c | 27 +++++++++++++++++++++++++-- net/mac80211/vht.c | 16 ++++++++++++++-- 6 files changed, 44 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e883c41a2163..0e3a1753a51c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1860,7 +1860,8 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, /* VHT can override some HT caps such as the A-MSDU max length */ if (params->vht_capa) ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - params->vht_capa, link_sta); + params->vht_capa, NULL, + link_sta); if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index e1900077bc4b..5542c93edfba 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1072,7 +1072,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, &chandef); memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie)); ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - &cap_ie, + &cap_ie, NULL, &sta->deflink); if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap))) rates_updated |= true; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b3d00259e1d6..98ef1fe1226e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2141,6 +2141,7 @@ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, + const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta); enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index f3d5bb0a59f1..a1e526419e9d 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -451,7 +451,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, changed |= IEEE80211_RC_BW_CHANGED; ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, - elems->vht_cap_elem, + elems->vht_cap_elem, NULL, &sta->deflink); ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 46d46cfab6c8..0e61eb5a29d1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4202,10 +4202,33 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, elems->ht_cap_elem, link_sta); - if (elems->vht_cap_elem && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) + if (elems->vht_cap_elem && + !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) { + const struct ieee80211_vht_cap *bss_vht_cap = NULL; + const struct cfg80211_bss_ies *ies; + + /* + * Cisco AP module 9115 with FW 17.3 has a bug and sends a + * too large maximum MPDU length in the association response + * (indicating 12k) that it cannot actually process ... + * Work around that. + */ + rcu_read_lock(); + ies = rcu_dereference(cbss->ies); + if (ies) { + const struct element *elem; + + elem = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, + ies->data, ies->len); + if (elem && elem->datalen >= sizeof(*bss_vht_cap)) + bss_vht_cap = (const void *)elem->data; + } + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, - link_sta); + bss_vht_cap, link_sta); + rcu_read_unlock(); + } if (elems->he_operation && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && elems->he_cap) { diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index c1250aa47808..b3a5c3e96a72 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2022 Intel Corporation + * Copyright (C) 2018 - 2023 Intel Corporation */ #include @@ -116,12 +116,14 @@ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, + const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; + u32 mpdu_len; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -317,11 +319,21 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); + /* + * Work around the Cisco 9115 FW 17.3 bug by taking the min of + * both reported MPDU lengths. + */ + mpdu_len = vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK; + if (vht_cap_ie2) + mpdu_len = min_t(u32, mpdu_len, + le32_get_bits(vht_cap_ie2->vht_cap_info, + IEEE80211_VHT_CAP_MAX_MPDU_MASK)); + /* * FIXME - should the amsdu len be per link? store per link * and maintain a minimum? */ - switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { + switch (mpdu_len) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; break; -- cgit v1.2.3 From 61304336c67358d49a989e5e0060d8c99bad6ca8 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Tue, 1 Aug 2023 02:47:51 -0400 Subject: wifi: mac80211: allow transmitting EAPOL frames with tainted key Lower layer device driver stop/wake TX by calling ieee80211_stop_queue()/ ieee80211_wake_queue() while hw scan. Sometimes hw scan and PTK rekey are running in parallel, when M4 sent from wpa_supplicant arrive while the TX queue is stopped, then the M4 will pending send, and then new key install from wpa_supplicant. After TX queue wake up by lower layer device driver, the M4 will be dropped by below call stack. When key install started, the current key flag is set KEY_FLAG_TAINTED in ieee80211_pairwise_rekey(), and then mac80211 wait key install complete by lower layer device driver. Meanwhile ieee80211_tx_h_select_key() will return TX_DROP for the M4 in step 12 below, and then ieee80211_free_txskb() called by ieee80211_tx_dequeue(), so the M4 will not send and free, then the rekey process failed becaue AP not receive M4. Please see details in steps below. There are a interval between KEY_FLAG_TAINTED set for current key flag and install key complete by lower layer device driver, the KEY_FLAG_TAINTED is set in this interval, all packet including M4 will be dropped in this interval, the interval is step 8~13 as below. issue steps: TX thread install key thread 1. stop_queue -idle- 2. sending M4 -idle- 3. M4 pending -idle- 4. -idle- starting install key from wpa_supplicant 5. -idle- =>ieee80211_key_replace() 6. -idle- =>ieee80211_pairwise_rekey() and set currently key->flags |= KEY_FLAG_TAINTED 7. -idle- =>ieee80211_key_enable_hw_accel() 8. -idle- =>drv_set_key() and waiting key install complete from lower layer device driver 9. wake_queue -waiting state- 10. re-sending M4 -waiting state- 11. =>ieee80211_tx_h_select_key() -waiting state- 12. drop M4 by KEY_FLAG_TAINTED -waiting state- 13. -idle- install key complete with success/fail success: clear flag KEY_FLAG_TAINTED fail: start disconnect Hence add check in step 11 above to allow the EAPOL send out in the interval. If lower layer device driver use the old key/cipher to encrypt the M4, then AP received/decrypt M4 correctly, after M4 send out, lower layer device driver install the new key/cipher to hardware and return success. If lower layer device driver use new key/cipher to send the M4, then AP will/should drop the M4, then it is same result with this issue, AP will/ should kick out station as well as this issue. issue log: kworker/u16:4-5238 [000] 6456.108926: stop_queue: phy1 queue:0, reason:0 wpa_supplicant-961 [003] 6456.119737: rdev_tx_control_port: wiphy_name=phy1 name=wlan0 ifindex=6 dest=ARRAY[9e, 05, 31, 20, 9b, d0] proto=36488 unencrypted=0 wpa_supplicant-961 [003] 6456.119839: rdev_return_int_cookie: phy1, returned 0, cookie: 504 wpa_supplicant-961 [003] 6456.120287: rdev_add_key: phy1, netdev:wlan0(6), key_index: 0, mode: 0, pairwise: true, mac addr: 9e:05:31:20:9b:d0 wpa_supplicant-961 [003] 6456.120453: drv_set_key: phy1 vif:wlan0(2) sta:9e:05:31:20:9b:d0 cipher:0xfac04, flags=0x9, keyidx=0, hw_key_idx=0 kworker/u16:9-3829 [001] 6456.168240: wake_queue: phy1 queue:0, reason:0 kworker/u16:9-3829 [001] 6456.168255: drv_wake_tx_queue: phy1 vif:wlan0(2) sta:9e:05:31:20:9b:d0 ac:0 tid:7 kworker/u16:9-3829 [001] 6456.168305: cfg80211_control_port_tx_status: wdev(1), cookie: 504, ack: false wpa_supplicant-961 [003] 6459.167982: drv_return_int: phy1 - -110 issue call stack: nl80211_frame_tx_status+0x230/0x340 [cfg80211] cfg80211_control_port_tx_status+0x1c/0x28 [cfg80211] ieee80211_report_used_skb+0x374/0x3e8 [mac80211] ieee80211_free_txskb+0x24/0x40 [mac80211] ieee80211_tx_dequeue+0x644/0x954 [mac80211] ath10k_mac_tx_push_txq+0xac/0x238 [ath10k_core] ath10k_mac_op_wake_tx_queue+0xac/0xe0 [ath10k_core] drv_wake_tx_queue+0x80/0x168 [mac80211] __ieee80211_wake_txqs+0xe8/0x1c8 [mac80211] _ieee80211_wake_txqs+0xb4/0x120 [mac80211] ieee80211_wake_txqs+0x48/0x80 [mac80211] tasklet_action_common+0xa8/0x254 tasklet_action+0x2c/0x38 __do_softirq+0xdc/0x384 Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20230801064751.25803-1-quic_wgong@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7fe7280e8437..d45d4be63dd8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -665,7 +665,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) } if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED && - !ieee80211_is_deauth(hdr->frame_control))) + !ieee80211_is_deauth(hdr->frame_control)) && + tx->skb->protocol != tx->sdata->control_port_protocol) return TX_DROP; if (!skip_hw && tx->key && -- cgit v1.2.3 From 334bf33eec5701a1e4e967bcb7cc8611a998334b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 25 Sep 2023 17:18:56 +0200 Subject: wifi: cfg80211: avoid leaking stack data into trace If the structure is not initialized then boolean types might be copied into the tracing data without being initialised. This causes data from the stack to leak into the trace and also triggers a UBSAN failure which can easily be avoided here. Signed-off-by: Benjamin Berg Link: https://lore.kernel.org/r/20230925171855.a9271ef53b05.I8180bae663984c91a3e036b87f36a640ba409817@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7a88361b3414..931a03f4549c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8501,7 +8501,7 @@ static int nl80211_update_mesh_config(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct mesh_config cfg; + struct mesh_config cfg = {}; u32 mask; int err; -- cgit v1.2.3 From aaba3cd33fc9593a858beeee419c0e6671ee9551 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 25 Sep 2023 17:30:29 +0200 Subject: wifi: mac80211: Create resources for disabled links When associating to an MLD AP, links may be disabled. Create all resources associated with a disabled link so that we can later enable it without having to create these resources on the fly. Fixes: 6d543b34dbcf ("wifi: mac80211: Support disabled links during association") Signed-off-by: Benjamin Berg Link: https://lore.kernel.org/r/20230925173028.f9afdb26f6c7.I4e6e199aaefc1bf017362d64f3869645fa6830b5@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0e61eb5a29d1..0c9198997482 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5130,9 +5130,10 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, continue; valid_links |= BIT(link_id); - if (assoc_data->link[link_id].disabled) { + if (assoc_data->link[link_id].disabled) dormant_links |= BIT(link_id); - } else if (link_id != assoc_data->assoc_link_id) { + + if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_allocate_link(sta, link_id); if (err) goto out_err; @@ -5147,7 +5148,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link; struct link_sta_info *link_sta; - if (!cbss || assoc_data->link[link_id].disabled) + if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); -- cgit v1.2.3 From 9b7177b1df64b8d7f85700027c324aadd6aded00 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 25 Sep 2023 20:52:58 -0700 Subject: bpf: tcp_read_skb needs to pop skb regardless of seq Before fix e5c6de5fa0258 tcp_read_skb() would increment the tp->copied-seq value. This (as described in the commit) would cause an error for apps because once that is incremented the application might believe there is no data to be read. Then some apps would stall or abort believing no data is available. However, the fix is incomplete because it introduces another issue in the skb dequeue. The loop does tcp_recv_skb() in a while loop to consume as many skbs as possible. The problem is the call is ... tcp_recv_skb(sk, seq, &offset) ... where 'seq' is: u32 seq = tp->copied_seq; Now we can hit a case where we've yet incremented copied_seq from BPF side, but then tcp_recv_skb() fails this test ... if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) ... so that instead of returning the skb we call tcp_eat_recv_skb() which frees the skb. This is because the routine believes the SKB has been collapsed per comment: /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ This can't happen here we've unlinked the full SKB and orphaned it. Anyways it would confuse any BPF programs if the data were suddenly moved underneath it. To fix this situation do simpler operation and just skb_peek() the data of the queue followed by the unlink. It shouldn't need to check this condition and tcp_read_skb() reads entire skbs so there is no need to handle the 'offset!=0' case as we would see in tcp_read_sock(). Fixes: e5c6de5fa0258 ("bpf, sockmap: Incorrectly handling copied_seq") Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230926035300.135096-2-john.fastabend@gmail.com --- net/ipv4/tcp.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c3040a63ebd..3f66cdeef7de 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1621,16 +1621,13 @@ EXPORT_SYMBOL(tcp_read_sock); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; struct sk_buff *skb; int copied = 0; - u32 offset; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; - while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; @@ -1643,13 +1640,10 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) copied = used; break; } - seq += used; copied += used; - if (tcp_flags & TCPHDR_FIN) { - ++seq; + if (tcp_flags & TCPHDR_FIN) break; - } } return copied; } -- cgit v1.2.3 From da9e915eaf5dadb1963b7738cdfa42ed55212445 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 25 Sep 2023 20:52:59 -0700 Subject: bpf, sockmap: Do not inc copied_seq when PEEK flag set When data is peek'd off the receive queue we shouldn't considered it copied from tcp_sock side. When we increment copied_seq this will confuse tcp_data_ready() because copied_seq can be arbitrarily increased. From application side it results in poll() operations not waking up when expected. Notice tcp stack without BPF recvmsg programs also does not increment copied_seq. We broke this when we moved copied_seq into recvmsg to only update when actual copy was happening. But, it wasn't working correctly either before because the tcp_data_ready() tried to use the copied_seq value to see if data was read by user yet. See fixes tags. Fixes: e5c6de5fa0258 ("bpf, sockmap: Incorrectly handling copied_seq") Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230926035300.135096-3-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 81f0dff69e0b..327268203001 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -222,6 +222,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int *addr_len) { struct tcp_sock *tcp = tcp_sk(sk); + int peek = flags & MSG_PEEK; u32 seq = tcp->copied_seq; struct sk_psock *psock; int copied = 0; @@ -311,7 +312,8 @@ msg_bytes_ready: copied = -EAGAIN; } out: - WRITE_ONCE(tcp->copied_seq, seq); + if (!peek) + WRITE_ONCE(tcp->copied_seq, seq); tcp_rcv_space_adjust(sk); if (copied > 0) __tcp_cleanup_rbuf(sk, copied); -- cgit v1.2.3 From b80e31baa43614e086a9d29dc1151932b1bd7fc5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 20 Sep 2023 12:20:55 +0200 Subject: bpf, sockmap: Reject sk_msg egress redirects to non-TCP sockets With a SOCKMAP/SOCKHASH map and an sk_msg program user can steer messages sent from one TCP socket (s1) to actually egress from another TCP socket (s2): tcp_bpf_sendmsg(s1) // = sk_prot->sendmsg tcp_bpf_send_verdict(s1) // __SK_REDIRECT case tcp_bpf_sendmsg_redir(s2) tcp_bpf_push_locked(s2) tcp_bpf_push(s2) tcp_rate_check_app_limited(s2) // expects tcp_sock tcp_sendmsg_locked(s2) // ditto There is a hard-coded assumption in the call-chain, that the egress socket (s2) is a TCP socket. However in commit 122e6c79efe1 ("sock_map: Update sock type checks for UDP") we have enabled redirects to non-TCP sockets. This was done for the sake of BPF sk_skb programs. There was no indention to support sk_msg send-to-egress use case. As a result, attempts to send-to-egress through a non-TCP socket lead to a crash due to invalid downcast from sock to tcp_sock: BUG: kernel NULL pointer dereference, address: 000000000000002f ... Call Trace: ? show_regs+0x60/0x70 ? __die+0x1f/0x70 ? page_fault_oops+0x80/0x160 ? do_user_addr_fault+0x2d7/0x800 ? rcu_is_watching+0x11/0x50 ? exc_page_fault+0x70/0x1c0 ? asm_exc_page_fault+0x27/0x30 ? tcp_tso_segs+0x14/0xa0 tcp_write_xmit+0x67/0xce0 __tcp_push_pending_frames+0x32/0xf0 tcp_push+0x107/0x140 tcp_sendmsg_locked+0x99f/0xbb0 tcp_bpf_push+0x19d/0x3a0 tcp_bpf_sendmsg_redir+0x55/0xd0 tcp_bpf_send_verdict+0x407/0x550 tcp_bpf_sendmsg+0x1a1/0x390 inet_sendmsg+0x6a/0x70 sock_sendmsg+0x9d/0xc0 ? sockfd_lookup_light+0x12/0x80 __sys_sendto+0x10e/0x160 ? syscall_enter_from_user_mode+0x20/0x60 ? __this_cpu_preempt_check+0x13/0x20 ? lockdep_hardirqs_on+0x82/0x110 __x64_sys_sendto+0x1f/0x30 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Reject selecting a non-TCP sockets as redirect target from a BPF sk_msg program to prevent the crash. When attempted, user will receive an EACCES error from send/sendto/sendmsg() syscall. Fixes: 122e6c79efe1 ("sock_map: Update sock type checks for UDP") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20230920102055.42662-1-jakub@cloudflare.com --- net/core/sock_map.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index cb11750b1df5..4292c2ed1828 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -668,6 +668,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1267,6 +1269,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; -- cgit v1.2.3 From 25563b581ba3a1f263a00e8c9a97f5e7363be6fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 08:46:26 +0000 Subject: net: fix possible store tearing in neigh_periodic_work() While looking at a related syzbot report involving neigh_periodic_work(), I found that I forgot to add an annotation when deleting an RCU protected item from a list. Readers use rcu_deference(*np), we need to use either rcu_assign_pointer() or WRITE_ONCE() on writer side to prevent store tearing. I use rcu_assign_pointer() to have lockdep support, this was the choice made in neigh_flush_dev(). Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6b76cd103195..7212c7e521ef 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -988,7 +988,9 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); -- cgit v1.2.3 From 5baa0433a15eadd729625004c37463acb982eca7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 09:27:13 +0000 Subject: neighbour: fix data-races around n->output n->output field can be read locklessly, while a writer might change the pointer concurrently. Add missing annotations to prevent load-store tearing. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 15186247b59a..033034d68f1f 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -294,7 +294,7 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ /* tell br_dev_xmit to continue with forwarding */ nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ - ret = neigh->output(neigh, skb); + ret = READ_ONCE(neigh->output)(neigh, skb); } neigh_release(neigh); return ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7212c7e521ef..9c09f091cbff 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -410,7 +410,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; - n->output = neigh_blackhole; + WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else @@ -920,7 +920,7 @@ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); - neigh->output = neigh->ops->output; + WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; @@ -932,7 +932,7 @@ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); - neigh->output = neigh->ops->connected_output; + WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) @@ -1449,7 +1449,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, if (n2) n1 = n2; } - n1->output(n1, skb); + READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); @@ -3155,7 +3155,7 @@ int neigh_xmit(int index, struct net_device *dev, rcu_read_unlock(); goto out_kfree_skb; } - err = neigh->output(neigh, skb); + err = READ_ONCE(neigh->output)(neigh, skb); rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { -- cgit v1.2.3 From 9d4c75800f61e5d75c1659ba201b6c0c7ead3070 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Sep 2023 11:41:19 +0100 Subject: ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data() Including the transhdrlen in length is a problem when the packet is partially filled (e.g. something like send(MSG_MORE) happened previously) when appending to an IPv4 or IPv6 packet as we don't want to repeat the transport header or account for it twice. This can happen under some circumstances, such as splicing into an L2TP socket. The symptom observed is a warning in __ip6_append_data(): WARNING: CPU: 1 PID: 5042 at net/ipv6/ip6_output.c:1800 __ip6_append_data.isra.0+0x1be8/0x47f0 net/ipv6/ip6_output.c:1800 that occurs when MSG_SPLICE_PAGES is used to append more data to an already partially occupied skbuff. The warning occurs when 'copy' is larger than the amount of data in the message iterator. This is because the requested length includes the transport header length when it shouldn't. This can be triggered by, for example: sfd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_L2TP); bind(sfd, ...); // ::1 connect(sfd, ...); // ::1 port 7 send(sfd, buffer, 4100, MSG_MORE); sendfile(sfd, dfd, NULL, 1024); Fix this by only adding transhdrlen into the length if the write queue is empty in l2tp_ip6_sendmsg(), analogously to how UDP does things. l2tp_ip_sendmsg() looks like it won't suffer from this problem as it builds the UDP packet itself. Fixes: a32e0eec7042 ("l2tp: introduce L2TPv3 IP encapsulation support for IPv6") Reported-by: syzbot+62cbf263225ae13ff153@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/0000000000001c12b30605378ce8@google.com/ Suggested-by: Willem de Bruijn Signed-off-by: David Howells cc: Eric Dumazet cc: Willem de Bruijn cc: "David S. Miller" cc: David Ahern cc: Paolo Abeni cc: Jakub Kicinski cc: netdev@vger.kernel.org cc: bpf@vger.kernel.org cc: syzkaller-bugs@googlegroups.com Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ed8ebb6f5909..11f3d375cec0 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -507,7 +507,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; - ulen = len + transhdrlen; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) @@ -628,6 +627,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); + ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0; err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, -- cgit v1.2.3 From 26297b4ce1ce4ea40bc9a48ec99f45da3f64d2e2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Thu, 21 Sep 2023 18:46:40 -0500 Subject: net: replace calls to sock->ops->connect() with kernel_connect() commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") ensured that kernel_connect() will not overwrite the address parameter in cases where BPF connect hooks perform an address rewrite. This change replaces direct calls to sock->ops->connect() in net with kernel_connect() to make these call safe. Link: https://lore.kernel.org/netdev/20230912013332.2048422-1-jrife@google.com/ Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect") Cc: stable@vger.kernel.org Reviewed-by: Willem de Bruijn Signed-off-by: Jordan Rife Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- net/rds/tcp_connect.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index da5af28ff57b..6e4ed1e11a3b 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1505,8 +1505,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - salen, 0); + result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index f0c477c5d1db..d788c6d28986 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -173,7 +173,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) -- cgit v1.2.3 From 86a7e0b69bd5b812e48a20c66c2161744f3caa16 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Thu, 21 Sep 2023 18:46:41 -0500 Subject: net: prevent rewrite of msg_name in sock_sendmsg() Callers of sock_sendmsg(), and similarly kernel_sendmsg(), in kernel space may observe their value of msg_name change in cases where BPF sendmsg hooks rewrite the send address. This has been confirmed to break NFS mounts running in UDP mode and has the potential to break other systems. This patch: 1) Creates a new function called __sock_sendmsg() with same logic as the old sock_sendmsg() function. 2) Replaces calls to sock_sendmsg() made by __sys_sendto() and __sys_sendmsg() with __sock_sendmsg() to avoid an unnecessary copy, as these system calls are already protected. 3) Modifies sock_sendmsg() so that it makes a copy of msg_name if present before passing it down the stack to insulate callers from changes to the send address. Link: https://lore.kernel.org/netdev/20230912013332.2048422-1-jrife@google.com/ Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Cc: stable@vger.kernel.org Reviewed-by: Willem de Bruijn Signed-off-by: Jordan Rife Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/socket.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index c8b08b32f097..a39ec136f5cf 100644 --- a/net/socket.c +++ b/net/socket.c @@ -737,6 +737,14 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) return ret; } +static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) +{ + int err = security_socket_sendmsg(sock, msg, + msg_data_left(msg)); + + return err ?: sock_sendmsg_nosec(sock, msg); +} + /** * sock_sendmsg - send a message through @sock * @sock: socket @@ -747,10 +755,19 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - int err = security_socket_sendmsg(sock, msg, - msg_data_left(msg)); + struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; + struct sockaddr_storage address; + int ret; - return err ?: sock_sendmsg_nosec(sock, msg); + if (msg->msg_name) { + memcpy(&address, msg->msg_name, msg->msg_namelen); + msg->msg_name = &address; + } + + ret = __sock_sendmsg(sock, msg); + msg->msg_name = save_addr; + + return ret; } EXPORT_SYMBOL(sock_sendmsg); @@ -1138,7 +1155,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = sock_sendmsg(sock, &msg); + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -2174,7 +2191,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg); + err = __sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -2538,7 +2555,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys); + err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. -- cgit v1.2.3 From c889a99a21bf124c3db08d09df919f0eccc5ea4c Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Thu, 21 Sep 2023 18:46:42 -0500 Subject: net: prevent address rewrite in kernel_bind() Similar to the change in commit 0bdf399342c5("net: Avoid address overwrite in kernel_connect"), BPF hooks run on bind may rewrite the address passed to kernel_bind(). This change 1) Makes a copy of the bind address in kernel_bind() to insulate callers. 2) Replaces direct calls to sock->ops->bind() in net with kernel_bind() Link: https://lore.kernel.org/netdev/20230912013332.2048422-1-jrife@google.com/ Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind") Cc: stable@vger.kernel.org Reviewed-by: Willem de Bruijn Signed-off-by: Jordan Rife Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 2 +- net/socket.c | 7 ++++++- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 6e4ed1e11a3b..4174076c66fa 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1439,7 +1439,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1546,7 +1546,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index d788c6d28986..a0046e99d6df 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -145,7 +145,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = sock->ops->bind(sock, addr, addrlen); + ret = kernel_bind(sock, addr, addrlen); if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 014fa24418c1..53b3535a1e4a 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -306,7 +306,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) addr_len = sizeof(*sin); } - ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); diff --git a/net/socket.c b/net/socket.c index a39ec136f5cf..c4a6f5532955 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3516,7 +3516,12 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { - return READ_ONCE(sock->ops)->bind(sock, addr, addrlen); + struct sockaddr_storage address; + + memcpy(&address, addr, addrlen); + + return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address, + addrlen); } EXPORT_SYMBOL(kernel_bind); -- cgit v1.2.3 From 4b2b606075e50cdae62ab2356b0a1e206947c354 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 22 Sep 2023 15:55:08 +0800 Subject: ipv4/fib: send notify when delete source address routes After deleting an interface address in fib_del_ifaddr(), the function scans the fib_info list for stray entries and calls fib_flush() and fib_table_flush(). Then the stray entries will be deleted silently and no RTM_DELROUTE notification will be sent. This lack of notification can make routing daemons, or monitor like `ip monitor route` miss the routing changes. e.g. + ip link add dummy1 type dummy + ip link add dummy2 type dummy + ip link set dummy1 up + ip link set dummy2 up + ip addr add 192.168.5.5/24 dev dummy1 + ip route add 7.7.7.0/24 dev dummy2 src 192.168.5.5 + ip -4 route 7.7.7.0/24 dev dummy2 scope link src 192.168.5.5 192.168.5.0/24 dev dummy1 proto kernel scope link src 192.168.5.5 + ip monitor route + ip addr del 192.168.5.5/24 dev dummy1 Deleted 192.168.5.0/24 dev dummy1 proto kernel scope link src 192.168.5.5 Deleted broadcast 192.168.5.255 dev dummy1 table local proto kernel scope link src 192.168.5.5 Deleted local 192.168.5.5 dev dummy1 table local proto kernel scope host src 192.168.5.5 As Ido reminded, fib_table_flush() isn't only called when an address is deleted, but also when an interface is deleted or put down. The lack of notification in these cases is deliberate. And commit 7c6bb7d2faaf ("net/ipv6: Add knob to skip DELROUTE message on device down") introduced a sysctl to make IPv6 behave like IPv4 in this regard. So we can't send the route delete notify blindly in fib_table_flush(). To fix this issue, let's add a new flag in "struct fib_info" to track the deleted prefer source address routes, and only send notify for them. After update: + ip monitor route + ip addr del 192.168.5.5/24 dev dummy1 Deleted 192.168.5.0/24 dev dummy1 proto kernel scope link src 192.168.5.5 Deleted broadcast 192.168.5.255 dev dummy1 table local proto kernel scope link src 192.168.5.5 Deleted local 192.168.5.5 dev dummy1 table local proto kernel scope host src 192.168.5.5 Deleted 7.7.7.0/24 dev dummy2 scope link src 192.168.5.5 Suggested-by: Thomas Haller Signed-off-by: Hangbin Liu Acked-by: Nicolas Dichtel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230922075508.848925-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/fib_semantics.c | 1 + net/ipv4/fib_trie.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index eafa4a033515..1ea82bc33ef1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1887,6 +1887,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; + fi->pfsrc_removed = true; ret++; } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d13fb9e76b97..9bdfdab906fe 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2027,6 +2027,7 @@ void fib_table_flush_external(struct fib_table *tb) int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; + struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; @@ -2089,6 +2090,9 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); + if (fi->pfsrc_removed) + rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, + KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); -- cgit v1.2.3 From 9593c7cb6cf670ef724d17f7f9affd7a8d2ad0c5 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 22 Sep 2023 23:04:58 +0200 Subject: ipv6: tcp: add a missing nf_reset_ct() in 3WHS handling Commit b0e214d21203 ("netfilter: keep conntrack reference until IPsecv6 policy checks are done") is a direct copy of the old commit b59c270104f0 ("[NETFILTER]: Keep conntrack reference until IPsec policy checks are done") but for IPv6. However, it also copies a bug that this old commit had. That is: when the third packet of 3WHS connection establishment contains payload, it is added into socket receive queue without the XFRM check and the drop of connection tracking context. That leads to nf_conntrack module being impossible to unload as it waits for all the conntrack references to be dropped while the packet release is deferred in per-cpu cache indefinitely, if not consumed by the application. The issue for IPv4 was fixed in commit 6f0012e35160 ("tcp: add a missing nf_reset_ct() in 3WHS handling") by adding a missing XFRM check and correctly dropping the conntrack context. However, the issue was introduced to IPv6 code afterwards. Fixing it the same way for IPv6 now. Fixes: b0e214d21203 ("netfilter: keep conntrack reference until IPsecv6 policy checks are done") Link: https://lore.kernel.org/netdev/d589a999-d4dd-2768-b2d5-89dec64a4a42@ovn.org/ Signed-off-by: Ilya Maximets Acked-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230922210530.2045146-1-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- net/ipv6/tcp_ipv6.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3a88545a265d..44b6949d72b2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1640,9 +1640,12 @@ process: struct sock *nsk; sk = req->rsk_listener; - drop_reason = tcp_inbound_md5_hash(sk, skb, - &hdr->saddr, &hdr->daddr, - AF_INET6, dif, sdif); + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + drop_reason = SKB_DROP_REASON_XFRM_POLICY; + else + drop_reason = tcp_inbound_md5_hash(sk, skb, + &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); @@ -1689,6 +1692,7 @@ process: } goto discard_and_relse; } + nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); -- cgit v1.2.3 From 8957261cd8149ed9d0738c01c0320bcbff989407 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Fri, 8 Sep 2023 10:15:48 +0530 Subject: ethtool: plca: fix plca enable data type while parsing the value The ETHTOOL_A_PLCA_ENABLED data type is u8. But while parsing the value from the attribute, nla_get_u32() is used in the plca_update_sint() function instead of nla_get_u8(). So plca_cfg.enabled variable is updated with some garbage value instead of 0 or 1 and always enables plca even though plca is disabled through ethtool application. This bug has been fixed by parsing the values based on the attributes type in the policy. Fixes: 8580e16c28f3 ("net/ethtool: add netlink interface for the PLCA RS") Signed-off-by: Parthiban Veerasooran Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230908044548.5878-1-Parthiban.Veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- net/ethtool/plca.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index b238a1afe9ae..b1e2e3b5027f 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -21,16 +21,6 @@ struct plca_reply_data { #define PLCA_REPDATA(__reply_base) \ container_of(__reply_base, struct plca_reply_data, base) -static void plca_update_sint(int *dst, const struct nlattr *attr, - bool *mod) -{ - if (!attr) - return; - - *dst = nla_get_u32(attr); - *mod = true; -} - // PLCA get configuration message ------------------------------------------- // const struct nla_policy ethnl_plca_get_cfg_policy[] = { @@ -38,6 +28,29 @@ const struct nla_policy ethnl_plca_get_cfg_policy[] = { NLA_POLICY_NESTED(ethnl_header_policy), }; +static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid, + bool *mod) +{ + const struct nlattr *attr = tb[attrid]; + + if (!attr || + WARN_ON_ONCE(attrid >= ARRAY_SIZE(ethnl_plca_set_cfg_policy))) + return; + + switch (ethnl_plca_set_cfg_policy[attrid].type) { + case NLA_U8: + *dst = nla_get_u8(attr); + break; + case NLA_U32: + *dst = nla_get_u32(attr); + break; + default: + WARN_ON_ONCE(1); + } + + *mod = true; +} + static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) @@ -144,13 +157,13 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) return -EOPNOTSUPP; memset(&plca_cfg, 0xff, sizeof(plca_cfg)); - plca_update_sint(&plca_cfg.enabled, tb[ETHTOOL_A_PLCA_ENABLED], &mod); - plca_update_sint(&plca_cfg.node_id, tb[ETHTOOL_A_PLCA_NODE_ID], &mod); - plca_update_sint(&plca_cfg.node_cnt, tb[ETHTOOL_A_PLCA_NODE_CNT], &mod); - plca_update_sint(&plca_cfg.to_tmr, tb[ETHTOOL_A_PLCA_TO_TMR], &mod); - plca_update_sint(&plca_cfg.burst_cnt, tb[ETHTOOL_A_PLCA_BURST_CNT], + plca_update_sint(&plca_cfg.enabled, tb, ETHTOOL_A_PLCA_ENABLED, &mod); + plca_update_sint(&plca_cfg.node_id, tb, ETHTOOL_A_PLCA_NODE_ID, &mod); + plca_update_sint(&plca_cfg.node_cnt, tb, ETHTOOL_A_PLCA_NODE_CNT, &mod); + plca_update_sint(&plca_cfg.to_tmr, tb, ETHTOOL_A_PLCA_TO_TMR, &mod); + plca_update_sint(&plca_cfg.burst_cnt, tb, ETHTOOL_A_PLCA_BURST_CNT, &mod); - plca_update_sint(&plca_cfg.burst_tmr, tb[ETHTOOL_A_PLCA_BURST_TMR], + plca_update_sint(&plca_cfg.burst_tmr, tb, ETHTOOL_A_PLCA_BURST_TMR, &mod); if (!mod) return 0; -- cgit v1.2.3 From dfc7f7a988dad34c3bf4c053124fb26aa6c5f916 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Fri, 8 Sep 2023 19:58:53 -0400 Subject: net: nfc: llcp: Add lock when modifying device list The device list needs its associated lock held when modifying it, or the list could become corrupted, as syzbot discovered. Reported-and-tested-by: syzbot+c1d0a03d305972dbbe14@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c1d0a03d305972dbbe14 Signed-off-by: Jeremy Cline Reviewed-by: Simon Horman Fixes: 6709d4b7bc2e ("net: nfc: Fix use-after-free caused by nfc_llcp_find_local") Link: https://lore.kernel.org/r/20230908235853.1319596-1-jeremy@jcline.org Signed-off-by: Jakub Kicinski --- net/nfc/llcp_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index f60e424e0607..6705bb895e23 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1636,7 +1636,9 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); + spin_lock(&llcp_devices_lock); list_add(&local->list, &llcp_devices); + spin_unlock(&llcp_devices_lock); return 0; } -- cgit v1.2.3 From af84f9e447a65b4b9f79e7e5d69e19039b431c56 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Sep 2023 10:42:10 +0200 Subject: netfilter: nft_payload: rebuild vlan header on h_proto access nft can perform merging of adjacent payload requests. This means that: ether saddr 00:11 ... ether type 8021ad ... is a single payload expression, for 8 bytes, starting at the ethernet source offset. Check that offset+length is fully within the source/destination mac addersses. This bug prevents 'ether type' from matching the correct h_proto in case vlan tag got stripped. Fixes: de6843be3082 ("netfilter: nft_payload: rebuild vlan header when needed") Reported-by: David Ward Signed-off-by: Florian Westphal --- net/netfilter/nft_payload.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 8cb800989947..120f6d395b98 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -154,6 +154,17 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } +static bool nft_payload_need_vlan_copy(const struct nft_payload *priv) +{ + unsigned int len = priv->offset + priv->len; + + /* data past ether src/dst requested, copy needed */ + if (len > offsetof(struct ethhdr, h_proto)) + return true; + + return false; +} + void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -172,7 +183,7 @@ void nft_payload_eval(const struct nft_expr *expr, goto err; if (skb_vlan_tag_present(skb) && - priv->offset >= offsetof(struct ethhdr, h_proto)) { + nft_payload_need_vlan_copy(priv)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; -- cgit v1.2.3 From 8e56b063c86569e51eed1c5681ce6361fa97fc7a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 3 Oct 2023 13:17:53 -0400 Subject: netfilter: handle the connecting collision properly in nf_conntrack_proto_sctp In Scenario A and B below, as the delayed INIT_ACK always changes the peer vtag, SCTP ct with the incorrect vtag may cause packet loss. Scenario A: INIT_ACK is delayed until the peer receives its own INIT_ACK 192.168.1.2 > 192.168.1.1: [INIT] [init tag: 1328086772] 192.168.1.1 > 192.168.1.2: [INIT] [init tag: 1414468151] 192.168.1.2 > 192.168.1.1: [INIT ACK] [init tag: 1328086772] 192.168.1.1 > 192.168.1.2: [INIT ACK] [init tag: 1650211246] * 192.168.1.2 > 192.168.1.1: [COOKIE ECHO] 192.168.1.1 > 192.168.1.2: [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: [COOKIE ACK] Scenario B: INIT_ACK is delayed until the peer completes its own handshake 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] 192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] * This patch fixes it as below: In SCTP_CID_INIT processing: - clear ct->proto.sctp.init[!dir] if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]. (Scenario E) - set ct->proto.sctp.init[dir]. In SCTP_CID_INIT_ACK processing: - drop it if !ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] && ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario B, Scenario C) - drop it if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario A) In SCTP_CID_COOKIE_ACK processing: - clear ct->proto.sctp.init[dir] and ct->proto.sctp.init[!dir]. (Scenario D) Also, it's important to allow the ct state to move forward with cookie_echo and cookie_ack from the opposite dir for the collision scenarios. There are also other Scenarios where it should allow the packet through, addressed by the processing above: Scenario C: new CT is created by INIT_ACK. Scenario D: start INIT on the existing ESTABLISHED ct. Scenario E: start INIT after the old collision on the existing ESTABLISHED ct. 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] (both side are stopped, then start new connection again in hours) 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 242308742] Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Signed-off-by: Xin Long Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_proto_sctp.c | 43 +++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b6bcc8f2f46b..c6bd533983c1 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -112,7 +112,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -126,7 +126,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -412,6 +412,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ACK) { + ct->proto.sctp.init[dir] = 0; + ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); @@ -461,16 +464,18 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT || - sch->type == SCTP_CID_INIT_ACK) { - struct sctp_inithdr _inithdr, *ih; + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _ih, *ih; - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) goto out_unlock; - pr_debug("Setting vtag %x for dir %d\n", - ih->init_tag, !dir); + + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so @@ -481,6 +486,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; + } else if (sch->type == SCTP_CID_INIT_ACK) { + struct sctp_inithdr _ih, *ih; + __be32 vtag; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) + goto out_unlock; + + vtag = ct->proto.sctp.vtag[!dir]; + if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) + goto out_unlock; + /* collision */ + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && + vtag != ih->init_tag) + goto out_unlock; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; -- cgit v1.2.3 From 0d880dc6f032e0b541520e9926f398a77d3d433c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sat, 23 Sep 2023 03:53:50 +0200 Subject: netfilter: nf_tables: Deduplicate nft_register_obj audit logs When adding/updating an object, the transaction handler emits suitable audit log entries already, the one in nft_obj_notify() is redundant. To fix that (and retain the audit logging from objects' 'update' callback), Introduce an "audit log free" variant for internal use. Fixes: c520292f29b8 ("audit: log nftables configuration change events once per table") Signed-off-by: Phil Sutter Reviewed-by: Richard Guy Briggs Acked-by: Paul Moore (Audit) Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 44 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4356189360fb..a72b6aeefb1b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7871,24 +7871,14 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, return nft_delobj(&ctx, obj); } -void nft_obj_notify(struct net *net, const struct nft_table *table, - struct nft_object *obj, u32 portid, u32 seq, int event, - u16 flags, int family, int report, gfp_t gfp) +static void +__nft_obj_notify(struct net *net, const struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; - char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); - - audit_log_nfcfg(buf, - family, - obj->handle, - event == NFT_MSG_NEWOBJ ? - AUDIT_NFT_OP_OBJ_REGISTER : - AUDIT_NFT_OP_OBJ_UNREGISTER, - gfp); - kfree(buf); if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -7911,13 +7901,35 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } + +void nft_obj_notify(struct net *net, const struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, int event, + u16 flags, int family, int report, gfp_t gfp) +{ + struct nftables_pernet *nft_net = nft_pernet(net); + char *buf = kasprintf(gfp, "%s:%u", + table->name, nft_net->base_seq); + + audit_log_nfcfg(buf, + family, + obj->handle, + event == NFT_MSG_NEWOBJ ? + AUDIT_NFT_OP_OBJ_REGISTER : + AUDIT_NFT_OP_OBJ_UNREGISTER, + gfp); + kfree(buf); + + __nft_obj_notify(net, table, obj, portid, seq, event, + flags, family, report, gfp); +} EXPORT_SYMBOL_GPL(nft_obj_notify); static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { - nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->flags, ctx->family, ctx->report, GFP_KERNEL); + __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, + ctx->seq, event, ctx->flags, ctx->family, + ctx->report, GFP_KERNEL); } /* -- cgit v1.2.3 From 087388278e0f301f4c61ddffb1911d3a180f84b8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Sep 2023 15:12:44 +0200 Subject: netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure nft_rbtree_gc_elem() walks back and removes the end interval element that comes before the expired element. There is a small chance that we've cached this element as 'rbe_ge'. If this happens, we hold and test a pointer that has been queued for freeing. It also causes spurious insertion failures: $ cat test-testcases-sets-0044interval_overlap_0.1/testout.log Error: Could not process rule: File exists add element t s { 0 - 2 } ^^^^^^ Failed to insert 0 - 2 given: table ip t { set s { type inet_service flags interval,timeout timeout 2s gc-interval 2s } } The set (rbtree) is empty. The 'failure' doesn't happen on next attempt. Reason is that when we try to insert, the tree may hold an expired element that collides with the range we're adding. While we do evict/erase this element, we can trip over this check: if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; rbe_ge was erased by the synchronous gc, we should not have done this check. Next attempt won't find it, so retry results in successful insertion. Restart in-kernel to avoid such spurious errors. Such restart are rare, unless userspace intentionally adds very large numbers of elements with very short timeouts while setting a huge gc interval. Even in this case, this cannot loop forever, on each retry an existing element has been removed. As the caller is holding the transaction mutex, its impossible for a second entity to add more expiring elements to the tree. After this it also becomes feasible to remove the async gc worker and perform all garbage collection from the commit path. Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_rbtree.c | 46 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 487572dcd614..2660ceab3759 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -233,10 +233,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, rb_erase(&rbe->node, &priv->root); } -static int nft_rbtree_gc_elem(const struct nft_set *__set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, - u8 genmask) +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -246,7 +245,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); if (!gc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they @@ -261,6 +260,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, prev = rb_prev(prev); } + rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_remove(net, set, priv, rbe_prev); @@ -272,7 +272,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, */ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe_prev); } @@ -280,13 +280,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, nft_rbtree_gc_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe); nft_trans_gc_queue_sync_done(gc); - return 0; + return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, @@ -314,7 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - int d, err; + int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. This is the @@ -363,9 +363,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (nft_set_elem_expired(&rbe->ext) && nft_set_elem_active(&rbe->ext, cur_genmask)) { - err = nft_rbtree_gc_elem(set, priv, rbe, genmask); - if (err < 0) - return err; + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; continue; } @@ -486,11 +491,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe = elem->priv; int err; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); return err; } -- cgit v1.2.3 From 0add5c597f3253a9c6108a0a81d57f44ab0d9d30 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Tue, 26 Sep 2023 14:27:30 -0400 Subject: ipv4: Set offload_failed flag in fibmatch results Due to a small omission, the offload_failed flag is missing from ipv4 fibmatch results. Make sure it is set correctly. The issue can be witnessed using the following commands: echo "1 1" > /sys/bus/netdevsim/new_device ip link add dummy1 up type dummy ip route add 192.0.2.0/24 dev dummy1 echo 1 > /sys/kernel/debug/netdevsim/netdevsim1/fib/fail_route_offload ip route add 198.51.100.0/24 dev dummy1 ip route # 192.168.15.0/24 has rt_trap # 198.51.100.0/24 has rt_offload_failed ip route get 192.168.15.1 fibmatch # Result has rt_trap ip route get 198.51.100.1 fibmatch # Result differs from the route shown by `ip route`, it is missing # rt_offload_failed ip link del dev dummy1 echo 1 > /sys/bus/netdevsim/del_device Fixes: 36c5100e859d ("IPv4: Add "offload failed" indication to routes") Signed-off-by: Benjamin Poirier Reviewed-by: Ido Schimmel Reviewed-by: Simon Horman Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230926182730.231208-1-bpoirier@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a57062283219..b214b5a2e045 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3417,6 +3417,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); + fri.offload_failed = + READ_ONCE(fa->offload_failed); break; } } -- cgit v1.2.3 From 08e50cf071847323414df0835109b6f3560d44f5 Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Wed, 27 Sep 2023 18:14:14 +0000 Subject: tipc: fix a potential deadlock on &tx->lock It seems that tipc_crypto_key_revoke() could be be invoked by wokequeue tipc_crypto_work_rx() under process context and timer/rx callback under softirq context, thus the lock acquisition on &tx->lock seems better use spin_lock_bh() to prevent possible deadlock. This flaw was found by an experimental static analysis tool I am developing for irq-related deadlock. tipc_crypto_work_rx() --> tipc_crypto_key_distr() --> tipc_bcast_xmit() --> tipc_bcbase_xmit() --> tipc_bearer_bc_xmit() --> tipc_crypto_xmit() --> tipc_ehdr_build() --> tipc_crypto_key_revoke() --> spin_lock(&tx->lock) --> tipc_disc_timeout() --> tipc_bearer_xmit_skb() --> tipc_crypto_xmit() --> tipc_ehdr_build() --> tipc_crypto_key_revoke() --> spin_lock(&tx->lock) Signed-off-by: Chengfeng Ye Reviewed-by: Jacob Keller Acked-by: Jon Maloy Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Link: https://lore.kernel.org/r/20230927181414.59928-1-dg573847474@gmail.com Signed-off-by: Jakub Kicinski --- net/tipc/crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 302fd749c424..43c3f1c971b8 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1441,14 +1441,14 @@ static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; - spin_lock(&tx->lock); + spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); - spin_unlock(&tx->lock); + spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; -- cgit v1.2.3 From 059217c18be6757b95bfd77ba53fb50b48b8a816 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 1 Oct 2023 11:12:38 -0400 Subject: tcp: fix quick-ack counting to count actual ACKs of new data This commit fixes quick-ack counting so that it only considers that a quick-ack has been provided if we are sending an ACK that newly acknowledges data. The code was erroneously using the number of data segments in outgoing skbs when deciding how many quick-ack credits to remove. This logic does not make sense, and could cause poor performance in request-response workloads, like RPC traffic, where requests or responses can be multi-segment skbs. When a TCP connection decides to send N quick-acks, that is to accelerate the cwnd growth of the congestion control module controlling the remote endpoint of the TCP connection. That quick-ack decision is purely about the incoming data and outgoing ACKs. It has nothing to do with the outgoing data or the size of outgoing data. And in particular, an ACK only serves the intended purpose of allowing the remote congestion control to grow the congestion window quickly if the ACK is ACKing or SACKing new data. The fix is simple: only count packets as serving the goal of the quickack mechanism if they are ACKing/SACKing new data. We can tell whether this is the case by checking inet_csk_ack_scheduled(), since we schedule an ACK exactly when we are ACKing/SACKing new data. Fixes: fc6415bcb0f5 ("[TCP]: Fix quick-ack decrementing with TSO.") Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231001151239.1866845-1-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ccfc8bbf7455..aa0fc8c766e5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -177,8 +177,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, - u32 rcv_nxt) +static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -192,7 +191,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ - tcp_dec_quickack_mode(sk, pkts); + tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1387,7 +1386,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); -- cgit v1.2.3 From 4720852ed9afb1c5ab84e96135cb5b73d5afde6f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 1 Oct 2023 11:12:39 -0400 Subject: tcp: fix delayed ACKs for MSS boundary condition This commit fixes poor delayed ACK behavior that can cause poor TCP latency in a particular boundary condition: when an application makes a TCP socket write that is an exact multiple of the MSS size. The problem is that there is painful boundary discontinuity in the current delayed ACK behavior. With the current delayed ACK behavior, we have: (1) If an app reads data when > 1*MSS is unacknowledged, then tcp_cleanup_rbuf() ACKs immediately because of: tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || (2) If an app reads all received data, and the packets were < 1*MSS, and either (a) the app is not ping-pong or (b) we received two packets < 1*MSS, then tcp_cleanup_rbuf() ACKs immediately beecause of: ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && (3) *However*: if an app reads exactly 1*MSS of data, tcp_cleanup_rbuf() does not send an immediate ACK. This is true even if the app is not ping-pong and the 1*MSS of data had the PSH bit set, suggesting the sending application completed an application write. Thus if the app is not ping-pong, we have this painful case where >1*MSS gets an immediate ACK, and <1*MSS gets an immediate ACK, but a write whose last skb is an exact multiple of 1*MSS can get a 40ms delayed ACK. This means that any app that transfers data in one direction and takes care to align write size or packet size with MSS can suffer this problem. With receive zero copy making 4KB MSS values more common, it is becoming more common to have application writes naturally align with MSS, and more applications are likely to encounter this delayed ACK problem. The fix in this commit is to refine the delayed ACK heuristics with a simple check: immediately ACK a received 1*MSS skb with PSH bit set if the app reads all data. Why? If an skb has a len of exactly 1*MSS and has the PSH bit set then it is likely the end of an application write. So more data may not be arriving soon, and yet the data sender may be waiting for an ACK if cwnd-bound or using TX zero copy. Thus we set ICSK_ACK_PUSHED in this case so that tcp_cleanup_rbuf() will send an ACK immediately if the app reads all of the data and is not ping-pong. Note that this logic is also executed for the case where len > MSS, but in that case this logic does not matter (and does not hurt) because tcp_cleanup_rbuf() will always ACK immediately if the app reads data and there is more than an MSS of unACKed data. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Cc: Xin Guo Link: https://lore.kernel.org/r/20231001151239.1866845-2-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 06fe1cf645d5..8afb0950a697 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -253,6 +253,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) if (unlikely(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE)) tcp_gro_dev_warn(sk, skb, len); + /* If the skb has a len of exactly 1*MSS and has the PSH bit + * set then it is likely the end of an application write. So + * more data may not be arriving soon, and yet the data sender + * may be waiting for an ACK if cwnd-bound or using TX zero + * copy. So we set ICSK_ACK_PUSHED here so that + * tcp_cleanup_rbuf() will send an ACK immediately if the app + * reads all of the data and is not ping-pong. If len > MSS + * then this logic does not matter (and does not hurt) because + * tcp_cleanup_rbuf() will always ACK immediately if the app + * reads data and there is more than an MSS of unACKed data. + */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. -- cgit v1.2.3 From 2222a78075f0c19ca18db53fd6623afb4aff602d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2023 10:58:45 -0400 Subject: sctp: update transport state when processing a dupcook packet During the 4-way handshake, the transport's state is set to ACTIVE in sctp_process_init() when processing INIT_ACK chunk on client or COOKIE_ECHO chunk on server. In the collision scenario below: 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] 192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] when processing COOKIE_ECHO on 192.168.1.2, as it's in COOKIE_WAIT state, sctp_sf_do_dupcook_b() is called by sctp_sf_do_5_2_4_dupcook() where it creates a new association and sets its transport to ACTIVE then updates to the old association in sctp_assoc_update(). However, in sctp_assoc_update(), it will skip the transport update if it finds a transport with the same ipaddr already existing in the old asoc, and this causes the old asoc's transport state not to move to ACTIVE after the handshake. This means if DATA retransmission happens at this moment, it won't be able to enter PF state because of the check 'transport->state == SCTP_ACTIVE' in sctp_do_8_2_transport_strike(). This patch fixes it by updating the transport in sctp_assoc_update() with sctp_assoc_add_peer() where it updates the transport state if there is already a transport with the same ipaddr exists in the old asoc. Signed-off-by: Xin Long Reviewed-by: Simon Horman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/r/fd17356abe49713ded425250cc1ae51e9f5846c6.1696172325.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/associola.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 796529167e8d..c45c192b7878 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1159,8 +1159,7 @@ int sctp_assoc_update(struct sctp_association *asoc, /* Add any peer addresses from the new association. */ list_for_each_entry(trans, &new->peer.transport_addr_list, transports) - if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr) && - !sctp_assoc_add_peer(asoc, &trans->ipaddr, + if (!sctp_assoc_add_peer(asoc, &trans->ipaddr, GFP_ATOMIC, trans->state)) return -ENOMEM; -- cgit v1.2.3 From 1f4e803cd9c9166eb8b6c8b0b8e4124f7499fc07 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2023 11:04:20 -0400 Subject: sctp: update hb timer immediately after users change hb_interval Currently, when hb_interval is changed by users, it won't take effect until the next expiry of hb timer. As the default value is 30s, users have to wait up to 30s to wait its hb_interval update to work. This becomes pretty bad in containers where a much smaller value is usually set on hb_interval. This patch improves it by resetting the hb timer immediately once the value of hb_interval is updated by users. Note that we don't address the already existing 'problem' when sending a heartbeat 'on demand' if one hb has just been sent(from the timer) mentioned in: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg590224.html Signed-off-by: Xin Long Reviewed-by: Simon Horman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/r/75465785f8ee5df2fb3acdca9b8fafdc18984098.1696172660.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ab943e8fb1db..7f89e43154c0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2450,6 +2450,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, if (trans) { trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + sctp_transport_reset_hb_timer(trans); } else if (asoc) { asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); -- cgit v1.2.3 From d0f95894fda7d4f895b29c1097f92d7fee278cb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Oct 2023 18:34:55 +0000 Subject: netlink: annotate data-races around sk->sk_err syzbot caught another data-race in netlink when setting sk->sk_err. Annotate all of them for good measure. BUG: KCSAN: data-race in netlink_recvmsg / netlink_recvmsg write to 0xffff8881613bb220 of 4 bytes by task 28147 on cpu 0: netlink_recvmsg+0x448/0x780 net/netlink/af_netlink.c:1994 sock_recvmsg_nosec net/socket.c:1027 [inline] sock_recvmsg net/socket.c:1049 [inline] __sys_recvfrom+0x1f4/0x2e0 net/socket.c:2229 __do_sys_recvfrom net/socket.c:2247 [inline] __se_sys_recvfrom net/socket.c:2243 [inline] __x64_sys_recvfrom+0x78/0x90 net/socket.c:2243 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd write to 0xffff8881613bb220 of 4 bytes by task 28146 on cpu 1: netlink_recvmsg+0x448/0x780 net/netlink/af_netlink.c:1994 sock_recvmsg_nosec net/socket.c:1027 [inline] sock_recvmsg net/socket.c:1049 [inline] __sys_recvfrom+0x1f4/0x2e0 net/socket.c:2229 __do_sys_recvfrom net/socket.c:2247 [inline] __se_sys_recvfrom net/socket.c:2243 [inline] __x64_sys_recvfrom+0x78/0x90 net/socket.c:2243 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00000000 -> 0x00000016 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28146 Comm: syz-executor.0 Not tainted 6.6.0-rc3-syzkaller-00055-g9ed22ae6be81 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/06/2023 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231003183455.3410550-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 642b9d382fb4..eb086b06d60d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -352,7 +352,7 @@ static void netlink_overrun(struct sock *sk) if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; + WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } @@ -1605,7 +1605,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) goto out; } - sk->sk_err = p->code; + WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; @@ -1991,7 +1991,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { - sk->sk_err = -ret; + WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } @@ -2511,7 +2511,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, err_bad_put: nlmsg_free(skb); err_skb: - NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); -- cgit v1.2.3 From a5efdbcece83af94180e8d7c0a6e22947318499d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 4 Oct 2023 13:38:11 -0700 Subject: mptcp: fix delegated action races The delegated action infrastructure is prone to the following race: different CPUs can try to schedule different delegated actions on the same subflow at the same time. Each of them will check different bits via mptcp_subflow_delegate(), and will try to schedule the action on the related per-cpu napi instance. Depending on the timing, both can observe an empty delegated list node, causing the same entry to be added simultaneously on two different lists. The root cause is that the delegated actions infra does not provide a single synchronization point. Address the issue reserving an additional bit to mark the subflow as scheduled for delegation. Acquiring such bit guarantee the caller to own the delegated list node, and being able to safely schedule the subflow. Clear such bit only when the subflow scheduling is completed, ensuring proper barrier in place. Additionally swap the meaning of the delegated_action bitmask, to allow the usage of the existing helper to set multiple bit at once. Fixes: bcd97734318d ("mptcp: use delegate action to schedule 3rd ack retrans") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20231004-send-net-20231004-v1-1-28de4ac663ae@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 28 ++++++++++++++-------------- net/mptcp/protocol.h | 35 ++++++++++++----------------------- net/mptcp/subflow.c | 10 ++++++++-- 3 files changed, 34 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e252539b1e19..c3b83cb390d9 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3425,24 +3425,21 @@ static void schedule_3rdack_retransmission(struct sock *ssk) sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } -void mptcp_subflow_process_delegated(struct sock *ssk) +void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; - if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { + if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); - mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND); } - if (test_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status)) { + if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); - mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_ACK); - } } static int mptcp_hash(struct sock *sk) @@ -3968,14 +3965,17 @@ static int mptcp_napi_poll(struct napi_struct *napi, int budget) struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); - if (!sock_owned_by_user(ssk) && - mptcp_subflow_has_delegated_action(subflow)) - mptcp_subflow_process_delegated(ssk); - /* ... elsewhere tcp_release_cb_override already processed - * the action or will do at next release_sock(). - * In both case must dequeue the subflow here - on the same - * CPU that scheduled it. - */ + if (!sock_owned_by_user(ssk)) { + mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); + } else { + /* tcp_release_cb_override already processed + * the action or will do at next release_sock(). + * In both case must dequeue the subflow here - on the same + * CPU that scheduled it. + */ + smp_wmb(); + clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); + } bh_unlock_sock(ssk); sock_put(ssk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ed61d6850cce..3612545fa62e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -444,9 +444,11 @@ struct mptcp_delegated_action { DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); -#define MPTCP_DELEGATE_SEND 0 -#define MPTCP_DELEGATE_ACK 1 +#define MPTCP_DELEGATE_SCHEDULED 0 +#define MPTCP_DELEGATE_SEND 1 +#define MPTCP_DELEGATE_ACK 2 +#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ @@ -564,23 +566,24 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } -void mptcp_subflow_process_delegated(struct sock *ssk); +void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { + long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); - /* The implied barrier pairs with mptcp_subflow_delegated_done(), and - * ensures the below list check sees list updates done prior to status - * bit changes + /* The implied barrier pairs with tcp_release_cb_override() + * mptcp_napi_poll(), and ensures the below list check sees list + * updates done prior to delegated status bits changes */ - if (!test_and_set_bit(action, &subflow->delegated_status)) { - /* still on delegated list from previous scheduling */ - if (!list_empty(&subflow->delegated_node)) + old = set_mask_bits(&subflow->delegated_status, 0, set_bits); + if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { + if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; delegated = this_cpu_ptr(&mptcp_delegated_actions); @@ -605,20 +608,6 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) return ret; } -static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow) -{ - return !!READ_ONCE(subflow->delegated_status); -} - -static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow, int action) -{ - /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before - * touching the status bit - */ - smp_wmb(); - clear_bit(action, &subflow->delegated_status); -} - int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 918c1a235790..9c1f8d1d63d2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1956,9 +1956,15 @@ static void subflow_ulp_clone(const struct request_sock *req, static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + long status; - if (mptcp_subflow_has_delegated_action(subflow)) - mptcp_subflow_process_delegated(ssk); + /* process and clear all the pending actions, but leave the subflow into + * the napi queue. To respect locking, only the same CPU that originated + * the action can touch the list. mptcp_napi_poll will take care of it. + */ + status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); + if (status) + mptcp_subflow_process_delegated(ssk, status); tcp_release_cb(ssk); } -- cgit v1.2.3 From e5ed101a602873d65d2d64edaba93e8c73ec1b0f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 4 Oct 2023 13:38:12 -0700 Subject: mptcp: userspace pm allow creating id 0 subflow This patch drops id 0 limitation in mptcp_nl_cmd_sf_create() to allow creating additional subflows with the local addr ID 0. There is no reason not to allow additional subflows from this local address: we should be able to create new subflows from the initial endpoint. This limitation was breaking fullmesh support from userspace. Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/391 Cc: stable@vger.kernel.org Suggested-by: Matthieu Baerts Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20231004-send-net-20231004-v1-2-28de4ac663ae@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index b5a8aa4c1ebd..d042d32beb4d 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -307,12 +307,6 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) goto create_err; } - if (addr_l.id == 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); - err = -EINVAL; - goto create_err; - } - err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); -- cgit v1.2.3