From e9789c7cc182484fc031fd88097eb14cb26c4596 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Sep 2019 18:24:43 -0700 Subject: sch_cbq: validate TCA_CBQ_WRROPT to avoid crash syzbot reported a crash in cbq_normalize_quanta() caused by an out of range cl->priority. iproute2 enforces this check, but malicious users do not. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI Modules linked in: CPU: 1 PID: 26447 Comm: syz-executor.1 Not tainted 5.3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:cbq_normalize_quanta.part.0+0x1fd/0x430 net/sched/sch_cbq.c:902 RSP: 0018:ffff8801a5c333b0 EFLAGS: 00010206 RAX: 0000000020000003 RBX: 00000000fffffff8 RCX: ffffc9000712f000 RDX: 00000000000043bf RSI: ffffffff83be8962 RDI: 0000000100000018 RBP: ffff8801a5c33420 R08: 000000000000003a R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 00000000000002ef R13: ffff88018da95188 R14: dffffc0000000000 R15: 0000000000000015 FS: 00007f37d26b1700(0000) GS:ffff8801dad00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004c7cec CR3: 00000001bcd0a006 CR4: 00000000001626f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] cbq_normalize_quanta include/net/pkt_sched.h:27 [inline] [] cbq_addprio net/sched/sch_cbq.c:1097 [inline] [] cbq_set_wrr+0x2d7/0x450 net/sched/sch_cbq.c:1115 [] cbq_change_class+0x987/0x225b net/sched/sch_cbq.c:1537 [] tc_ctl_tclass+0x555/0xcd0 net/sched/sch_api.c:2329 [] rtnetlink_rcv_msg+0x485/0xc10 net/core/rtnetlink.c:5248 [] netlink_rcv_skb+0x17a/0x460 net/netlink/af_netlink.c:2510 [] rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5266 [] netlink_unicast_kernel net/netlink/af_netlink.c:1324 [inline] [] netlink_unicast+0x536/0x720 net/netlink/af_netlink.c:1350 [] netlink_sendmsg+0x89a/0xd50 net/netlink/af_netlink.c:1939 [] sock_sendmsg_nosec net/socket.c:673 [inline] [] sock_sendmsg+0x12e/0x170 net/socket.c:684 [] ___sys_sendmsg+0x81d/0x960 net/socket.c:2359 [] __sys_sendmsg+0x105/0x1d0 net/socket.c:2397 [] SYSC_sendmsg net/socket.c:2406 [inline] [] SyS_sendmsg+0x29/0x30 net/socket.c:2404 [] do_syscall_64+0x528/0x770 arch/x86/entry/common.c:305 [] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 06c7a2da21bc..39b427dc7512 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1127,6 +1127,33 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, }; +static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); + return -EINVAL; + } + + err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, + cbq_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CBQ_WRROPT]) { + const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); + + if (wrr->priority > TC_CBQ_MAXPRIO) { + NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); + err = -EINVAL; + } + } + return err; +} + static int cbq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1139,13 +1166,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); q->delay_timer.function = cbq_undelay; - if (!opt) { - NL_SET_ERR_MSG(extack, "CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; @@ -1464,13 +1485,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t struct cbq_class *parent; struct qdisc_rate_table *rtab = NULL; - if (!opt) { - NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, - extack); + err = cbq_opt_parse(tb, opt, extack); if (err < 0) return err; -- cgit v1.2.3 From 0e141f757b2c78c983df893e9993313e2dc21e38 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 27 Sep 2019 14:58:20 +0800 Subject: erspan: remove the incorrect mtu limit for erspan erspan driver calls ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. It causes the dev mtu of the erspan device to not be greater than 1500, this limit value is not correct for ipgre tap device. Tested: Before patch: # ip link set erspan0 mtu 1600 Error: mtu greater than device maximum. After patch: # ip link set erspan0 mtu 1600 # ip -d link show erspan0 21: erspan0@NONE: mtu 1600 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 0 Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a53a543fe055..52690bb3e40f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1446,6 +1446,7 @@ static void erspan_setup(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &erspan_netdev_ops; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; -- cgit v1.2.3 From 8c7138b33e5c690c308b2a7085f6313fdcb3f616 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 27 Sep 2019 16:00:31 -0700 Subject: net: Unpublish sk from sk_reuseport_cb before call_rcu The "reuse->sock[]" array is shared by multiple sockets. The going away sk must unpublish itself from "reuse->sock[]" before making call_rcu() call. However, this unpublish-action is currently done after a grace period and it may cause use-after-free. The fix is to move reuseport_detach_sock() to sk_destruct(). Due to the above reason, any socket with sk_reuseport_cb has to go through the rcu grace period before freeing it. It is a rather old bug (~3 yrs). The Fixes tag is not necessary the right commit but it is the one that introduced the SOCK_RCU_FREE logic and this fix is depending on it. Fixes: a4298e4522d6 ("net: add SOCK_RCU_FREE socket flag") Cc: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/core/sock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 07863edbe6fc..e23ec80a67bf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1700,8 +1700,6 @@ static void __sk_destruct(struct rcu_head *head) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -1728,7 +1726,14 @@ static void __sk_destruct(struct rcu_head *head) void sk_destruct(struct sock *sk) { - if (sock_flag(sk, SOCK_RCU_FREE)) + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + + if (rcu_access_pointer(sk->sk_reuseport_cb)) { + reuseport_detach_sock(sk); + use_call_rcu = true; + } + + if (use_call_rcu) call_rcu(&sk->sk_rcu, __sk_destruct); else __sk_destruct(&sk->sk_rcu); -- cgit v1.2.3 From 68ce6688a5baefde30914fc07fc27292dbbe8320 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:01:39 +0300 Subject: net: sched: taprio: Fix potential integer overflow in taprio_set_picos_per_byte The speed divisor is used in a context expecting an s64, but it is evaluated using 32-bit arithmetic. To avoid that happening, instead of multiplying by 1,000,000 in the first place, simplify the fraction and do a standard 32 bit division instead. Fixes: f04b514c0ce2 ("taprio: Set default link speed to 10 Mbps in taprio_set_picos_per_byte") Reported-by: Gustavo A. R. Silva Suggested-by: Eric Dumazet Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2f7b34205c82..2aab46ada94f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1048,8 +1048,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev, speed = ecmd.base.speed; skip: - picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - speed * 1000 * 1000); + picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", -- cgit v1.2.3 From f88eb7c0d002a67ef31aeb7850b42ff69abc46dc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 20 Sep 2019 21:54:17 +0200 Subject: nl80211: validate beacon head We currently don't validate the beacon head, i.e. the header, fixed part and elements that are to go in front of the TIM element. This means that the variable elements there can be malformed, e.g. have a length exceeding the buffer size, but most downstream code from this assumes that this has already been checked. Add the necessary checks to the netlink policy. Cc: stable@vger.kernel.org Fixes: ed1b6cc7f80f ("cfg80211/nl80211: add beacon settings") Link: https://lore.kernel.org/r/1569009255-I7ac7fbe9436e9d8733439eab8acbbd35e55c74ef@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d21b1581a665..7386421e2ad3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info) return __cfg80211_rdev_from_attrs(netns, info->attrs); } +static int validate_beacon_head(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + const struct element *elem; + const struct ieee80211_mgmt *mgmt = (void *)data; + unsigned int fixedlen = offsetof(struct ieee80211_mgmt, + u.beacon.variable); + + if (len < fixedlen) + goto err; + + if (ieee80211_hdrlen(mgmt->frame_control) != + offsetof(struct ieee80211_mgmt, u.beacon)) + goto err; + + data += fixedlen; + len -= fixedlen; + + for_each_element(elem, data, len) { + /* nothing */ + } + + if (for_each_element_completed(elem, data, len)) + return 0; + +err: + NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head"); + return -EINVAL; +} + static int validate_ie_attr(const struct nlattr *attr, struct netlink_ext_ack *extack) { @@ -338,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, - [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY, - .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_BEACON_HEAD] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head, + IEEE80211_MAX_DATA_LEN), [NL80211_ATTR_BEACON_TAIL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, IEEE80211_MAX_DATA_LEN), -- cgit v1.2.3 From 242b0931c1918c56cd1dc5563fd250a3c39b996d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 20 Sep 2019 21:54:18 +0200 Subject: cfg80211: validate SSID/MBSSID element ordering assumption The code copying the data assumes that the SSID element is before the MBSSID element, but since the data is untrusted from the AP, this cannot be guaranteed. Validate that this is indeed the case and ignore the MBSSID otherwise, to avoid having to deal with both cases for the copy of data that should be between them. Cc: stable@vger.kernel.org Fixes: 0b8fb8235be8 ("cfg80211: Parsing of Multiple BSSID information in scanning") Link: https://lore.kernel.org/r/1569009255-I1673911f5eae02964e21bdc11b2bf58e5e207e59@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d313c9befa23..ff1016607f0b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1723,7 +1723,12 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, return; new_ie_len -= trans_ssid[1]; mbssid = cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen); - if (!mbssid) + /* + * It's not valid to have the MBSSID element before SSID + * ignore if that happens - the code below assumes it is + * after (while copying things inbetween). + */ + if (!mbssid || mbssid < trans_ssid) return; new_ie_len -= mbssid[1]; rcu_read_lock(); -- cgit v1.2.3 From f43e5210c739fe76a4b0ed851559d6902f20ceb1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 23 Sep 2019 13:51:16 +0200 Subject: cfg80211: initialize on-stack chandefs In a few places we don't properly initialize on-stack chandefs, resulting in EDMG data to be non-zero, which broke things. Additionally, in a few places we rely on the driver to init the data completely, but perhaps we shouldn't as non-EDMG drivers may not initialize the EDMG data, also initialize it there. Cc: stable@vger.kernel.org Fixes: 2a38075cd0be ("nl80211: Add support for EDMG channels") Reported-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Link: https://lore.kernel.org/r/1569239475-I2dcce394ecf873376c386a78f31c2ec8b538fa25@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 +++- net/wireless/reg.c | 2 +- net/wireless/wext-compat.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7386421e2ad3..10617566a117 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2669,6 +2669,8 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + memset(chandef, 0, sizeof(*chandef)); + chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = control_freq; @@ -3209,7 +3211,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_channel) { int ret; - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; ret = rdev_get_channel(rdev, wdev, &chandef); if (ret == 0) { diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5311d0ae2454..420c4207ab59 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2108,7 +2108,7 @@ static void reg_call_notifier(struct wiphy *wiphy, static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) { - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); enum nl80211_iftype iftype; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 7b6529d81c61..cac9e28d852b 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -798,7 +798,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - struct cfg80211_chan_def chandef; + struct cfg80211_chan_def chandef = {}; int ret; switch (wdev->iftype) { -- cgit v1.2.3 From b501426cf86e70649c983c52f4c823b3c40d72a3 Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Thu, 26 Sep 2019 16:16:50 +0800 Subject: nl80211: fix null pointer dereference If the interface is not in MESH mode, the command 'iw wlanx mpath del' will cause kernel panic. The root cause is null pointer access in mpp_flush_by_proxy(), as the pointer 'sdata->u.mesh.mpp_paths' is NULL for non MESH interface. Unable to handle kernel NULL pointer dereference at virtual address 00000068 [...] PC is at _raw_spin_lock_bh+0x20/0x5c LR is at mesh_path_del+0x1c/0x17c [mac80211] [...] Process iw (pid: 4537, stack limit = 0xd83e0238) [...] [] (_raw_spin_lock_bh) from [] (mesh_path_del+0x1c/0x17c [mac80211]) [] (mesh_path_del [mac80211]) from [] (extack_doit+0x20/0x68 [compat]) [] (extack_doit [compat]) from [] (genl_rcv_msg+0x274/0x30c) [] (genl_rcv_msg) from [] (netlink_rcv_skb+0x58/0xac) [] (netlink_rcv_skb) from [] (genl_rcv+0x20/0x34) [] (genl_rcv) from [] (netlink_unicast+0x11c/0x204) [] (netlink_unicast) from [] (netlink_sendmsg+0x30c/0x370) [] (netlink_sendmsg) from [] (sock_sendmsg+0x70/0x84) [] (sock_sendmsg) from [] (___sys_sendmsg.part.3+0x188/0x228) [] (___sys_sendmsg.part.3) from [] (__sys_sendmsg+0x4c/0x70) [] (__sys_sendmsg) from [] (ret_fast_syscall+0x0/0x44) Code: e2822c02 e2822001 e5832004 f590f000 (e1902f9f) ---[ end trace bbd717600f8f884d ]--- Signed-off-by: Miaoqing Pan Link: https://lore.kernel.org/r/1569485810-761-1-git-send-email-miaoqing@codeaurora.org [trim useless data from commit message] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 10617566a117..141cdb171665 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6305,6 +6305,9 @@ static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_mpath) return -EOPNOTSUPP; + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + return rdev_del_mpath(rdev, dev, dst); } -- cgit v1.2.3 From 8ed31a264065ae92058ce54aa3cc8da8d81dc6d7 Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Fri, 27 Sep 2019 10:03:16 +0800 Subject: mac80211: fix txq null pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the interface type is P2P_DEVICE or NAN, read the file of '/sys/kernel/debug/ieee80211/phyx/netdev:wlanx/aqm' will get a NULL pointer dereference. As for those interface type, the pointer sdata->vif.txq is NULL. Unable to handle kernel NULL pointer dereference at virtual address 00000011 CPU: 1 PID: 30936 Comm: cat Not tainted 4.14.104 #1 task: ffffffc0337e4880 task.stack: ffffff800cd20000 PC is at ieee80211_if_fmt_aqm+0x34/0xa0 [mac80211] LR is at ieee80211_if_fmt_aqm+0x34/0xa0 [mac80211] [...] Process cat (pid: 30936, stack limit = 0xffffff800cd20000) [...] [] ieee80211_if_fmt_aqm+0x34/0xa0 [mac80211] [] ieee80211_if_read+0x60/0xbc [mac80211] [] ieee80211_if_read_aqm+0x28/0x30 [mac80211] [] full_proxy_read+0x2c/0x48 [] __vfs_read+0x2c/0xd4 [] vfs_read+0x8c/0x108 [] SyS_read+0x40/0x7c Signed-off-by: Miaoqing Pan Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/1569549796-8223-1-git-send-email-miaoqing@codeaurora.org [trim useless data from commit message] Signed-off-by: Johannes Berg --- net/mac80211/debugfs_netdev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index b1438fd4d876..64b544ae9966 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -487,9 +487,14 @@ static ssize_t ieee80211_if_fmt_aqm( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { struct ieee80211_local *local = sdata->local; - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + struct txq_info *txqi; int len; + if (!sdata->vif.txq) + return 0; + + txqi = to_txq_info(sdata->vif.txq); + spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -658,7 +663,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); - if (sdata->local->ops->wake_tx_queue) + if (sdata->local->ops->wake_tx_queue && + sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) DEBUGFS_ADD(aqm); } -- cgit v1.2.3 From d8dec42b5c2d2b273bc30b0e073cfbe832d69902 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 1 Oct 2019 13:19:23 +0200 Subject: mac80211: keep BHs disabled while calling drv_tx_wake_queue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers typically expect this, as it's the case for almost all cases where this is called (i.e. from the TX path). Also, the code in mac80211 itself (if the driver calls ieee80211_tx_dequeue()) expects this as it uses this_cpu_ptr() without additional protection. This should fix various reports of the problem: https://bugzilla.kernel.org/show_bug.cgi?id=204127 https://lore.kernel.org/linux-wireless/CAN5HydrWb3o_FE6A1XDnP1E+xS66d5kiEuhHfiGKkLNQokx13Q@mail.gmail.com/ https://lore.kernel.org/lkml/nycvar.YFH.7.76.1909111238470.473@cbobk.fhfr.pm/ Cc: stable@vger.kernel.org Reported-and-tested-by: Jiri Kosina Reported-by: Aaron Hill Reported-by: Lukas Redlinger Reported-by: Oleksii Shevchuk Fixes: 21a5d4c3a45c ("mac80211: add stop/start logic for software TXQs") Link: https://lore.kernel.org/r/1569928763-I3e8838c5ecad878e59d4a94eb069a90f6641461a@changeid Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/util.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 051a02ddcb85..32a7a53833c0 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -247,7 +247,8 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) struct sta_info *sta; int i; - spin_lock_bh(&fq->lock); + local_bh_disable(); + spin_lock(&fq->lock); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; @@ -273,9 +274,9 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) &txqi->flags)) continue; - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); - spin_lock_bh(&fq->lock); + spin_lock(&fq->lock); } } @@ -288,12 +289,14 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac) goto out; - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); drv_wake_tx_queue(local, txqi); + local_bh_enable(); return; out: - spin_unlock_bh(&fq->lock); + spin_unlock(&fq->lock); + local_bh_enable(); } static void -- cgit v1.2.3 From 895b5c9f206eb7d25dc1360a8ccfc5958895eb89 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 29 Sep 2019 20:54:03 +0200 Subject: netfilter: drop bridge nf reset from nf_reset commit 174e23810cd31 ("sk_buff: drop all skb extensions on free and skb scrubbing") made napi recycle always drop skb extensions. The additional skb_ext_del() that is performed via nf_reset on napi skb recycle is not needed anymore. Most nf_reset() calls in the stack are there so queued skb won't block 'rmmod nf_conntrack' indefinitely. This removes the skb_ext_del from nf_reset, and renames it to a more fitting nf_reset_ct(). In a few selected places, add a call to skb_ext_reset to make sure that no active extensions remain. I am submitting this for "net", because we're still early in the release cycle. The patch applies to net-next too, but I think the rename causes needless divergence between those trees. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/batman-adv/soft-interface.c | 2 +- net/core/skbuff.c | 2 +- net/dccp/ipv4.c | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/raw.c | 2 +- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_eth.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- net/packet/af_packet.c | 4 ++-- net/sctp/input.c | 2 +- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_interface.c | 2 +- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 24 files changed, 27 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a1146cb10919..9cbed6f5a85a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -436,7 +436,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* clean the netfilter state now that the batman-adv header has been * removed */ - nf_reset(skb); + nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 01d65206f4fb..529133611ea2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5120,7 +5120,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b685bc82f8d0..d9b4200ed12d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -871,7 +871,7 @@ lookup: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1e2392b7c64e..c59a78a267c3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -199,7 +199,7 @@ resubmit: kfree_skb(skb); return; } - nf_reset(skb); + nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 313470f6bb14..716d5472c022 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1794,7 +1794,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb, ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - nf_reset(skb); + nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, @@ -2140,7 +2140,7 @@ int ip_mr_input(struct sk_buff *skb) mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { - nf_reset(skb); + nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index af3fbf76dbd3..6cc5743c553a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 80da5a66d5d7..3183413ebc6c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - nf_reset(skb); + nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2ee45e3755e9..bf124b1742df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1916,7 +1916,7 @@ process: if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); if (tcp_filter(sk, skb)) goto discard_and_relse; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf755156a684..e8443cc5c1ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1969,7 +1969,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); @@ -2298,7 +2298,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index d432d0011c16..7e5df23cbe7b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -371,7 +371,7 @@ resubmit_final: /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); + nf_reset_ct(skb); skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index e6c9da9866b1..a0a2de30be3e 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -54,7 +54,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, return; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e1888ee4036..a77f6b7d3a7c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -215,7 +215,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) /* Not releasing hash table! */ if (clone) { - nf_reset(clone); + nf_reset_ct(clone); rawv6_rcv(sk, clone); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 105e5a7092e7..f82ea12bac37 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1078,7 +1078,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - nf_reset(skb); + nf_reset_ct(skb); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index bd3f39349d40..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -151,7 +151,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); rcu_read_lock(); dev = rcu_dereference(spriv->dev); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 622833317dcb..0d7c887a2b75 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -193,7 +193,7 @@ pass_up: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 687e23a8b326..802f19aba7e3 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -206,7 +206,7 @@ pass_up: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9c464d24beec..888d3068a492 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -613,7 +613,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) ret = ip_vs_confirm_conntrack(skb); if (ret == NF_ACCEPT) { - nf_reset(skb); + nf_reset_ct(skb); skb_forward_csum(skb); } return ret; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d2437b5b2f6a..21c90d3a7ebf 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -237,7 +237,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb) } skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2742b006d25..82a50e850245 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1821,7 +1821,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; @@ -2121,7 +2121,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; diff --git a/net/sctp/input.c b/net/sctp/input.c index 1008cdc44dd6..5a070fb5b278 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -201,7 +201,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; - nf_reset(skb); + nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6088bc2dc11e..9b599ed66d97 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -706,7 +706,7 @@ resume: if (err) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 2ab4859df55a..0f5131bc3342 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -185,7 +185,7 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9499b35feb92..b1db55b50ba1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -502,7 +502,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { - nf_reset(skb); + nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 21e939235b39..f2d1e573ea55 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2808,7 +2808,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) continue; } - nf_reset(skb); + nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); -- cgit v1.2.3 From 34a4c95abd25ab41fb390b985a08a651b1fa0b0f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 30 Sep 2019 11:05:49 +0200 Subject: netfilter: nft_connlimit: disable bh on garbage collection BH must be disabled when invoking nf_conncount_gc_list() to perform garbage collection, otherwise deadlock might happen. nf_conncount_add+0x1f/0x50 [nf_conncount] nft_connlimit_eval+0x4c/0xe0 [nft_connlimit] nft_dynset_eval+0xb5/0x100 [nf_tables] nft_do_chain+0xea/0x420 [nf_tables] ? sch_direct_xmit+0x111/0x360 ? noqueue_init+0x10/0x10 ? __qdisc_run+0x84/0x510 ? tcp_packet+0x655/0x1610 [nf_conntrack] ? ip_finish_output2+0x1a7/0x430 ? tcp_error+0x130/0x150 [nf_conntrack] ? nf_conntrack_in+0x1fc/0x4c0 [nf_conntrack] nft_do_chain_ipv4+0x66/0x80 [nf_tables] nf_hook_slow+0x44/0xc0 ip_rcv+0xb5/0xd0 ? ip_rcv_finish_core.isra.19+0x360/0x360 __netif_receive_skb_one_core+0x52/0x70 netif_receive_skb_internal+0x34/0xe0 napi_gro_receive+0xba/0xe0 e1000_clean_rx_irq+0x1e9/0x420 [e1000e] e1000e_poll+0xbe/0x290 [e1000e] net_rx_action+0x149/0x3b0 __do_softirq+0xde/0x2d8 irq_exit+0xba/0xc0 do_IRQ+0x85/0xd0 common_interrupt+0xf/0xf RIP: 0010:nf_conncount_gc_list+0x3b/0x130 [nf_conncount] Fixes: 2f971a8f4255 ("netfilter: nf_conncount: move all list iterations under spinlock") Reported-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_connlimit.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index af1497ab9464..69d6173f91e2 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -218,8 +218,13 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); + bool ret; - return nf_conncount_gc_list(net, &priv->list); + local_bh_disable(); + ret = nf_conncount_gc_list(net, &priv->list); + local_bh_enable(); + + return ret; } static struct nft_expr_type nft_connlimit_type; -- cgit v1.2.3 From 9a9251a3534745d08a92abfeca0ca467b912b5f6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:37:22 +0300 Subject: net: sched: taprio: Avoid division by zero on invalid link speed The check in taprio_set_picos_per_byte is currently not robust enough and will trigger this division by zero, due to e.g. PHYLINK not setting kset->base.speed when there is no PHY connected: [ 27.109992] Division by zero in kernel. [ 27.113842] CPU: 1 PID: 198 Comm: tc Not tainted 5.3.0-rc5-01246-gc4006b8c2637-dirty #212 [ 27.121974] Hardware name: Freescale LS1021A [ 27.126234] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 27.133938] [] (show_stack) from [] (dump_stack+0xb0/0xc4) [ 27.141124] [] (dump_stack) from [] (Ldiv0_64+0x8/0x18) [ 27.148052] [] (Ldiv0_64) from [] (div64_u64+0xcc/0xf0) [ 27.154978] [] (div64_u64) from [] (div64_s64+0x4c/0x68) [ 27.161993] [] (div64_s64) from [] (taprio_set_picos_per_byte+0xe8/0xf4) [ 27.170388] [] (taprio_set_picos_per_byte) from [] (taprio_change+0x668/0xcec) [ 27.179302] [] (taprio_change) from [] (qdisc_create+0x1fc/0x4f4) [ 27.187091] [] (qdisc_create) from [] (tc_modify_qdisc+0x1ac/0x6f8) [ 27.195055] [] (tc_modify_qdisc) from [] (rtnetlink_rcv_msg+0x268/0x2dc) [ 27.203449] [] (rtnetlink_rcv_msg) from [] (netlink_rcv_skb+0xe0/0x114) [ 27.211756] [] (netlink_rcv_skb) from [] (netlink_unicast+0x1b4/0x22c) [ 27.219977] [] (netlink_unicast) from [] (netlink_sendmsg+0x284/0x340) [ 27.228198] [] (netlink_sendmsg) from [] (sock_sendmsg+0x14/0x24) [ 27.235988] [] (sock_sendmsg) from [] (___sys_sendmsg+0x214/0x228) [ 27.243863] [] (___sys_sendmsg) from [] (__sys_sendmsg+0x50/0x8c) [ 27.251652] [] (__sys_sendmsg) from [] (ret_fast_syscall+0x0/0x54) [ 27.259524] Exception stack(0xe8045fa8 to 0xe8045ff0) [ 27.264546] 5fa0: b6f608c8 000000f8 00000003 bed7e2f0 00000000 00000000 [ 27.272681] 5fc0: b6f608c8 000000f8 004ce54c 00000128 5d3ce8c7 00000000 00000026 00505c9c [ 27.280812] 5fe0: 00000070 bed7e298 004ddd64 b6dd1e64 Russell King points out that the ethtool API says zero is a valid return value of __ethtool_get_link_ksettings: * If it is enabled then they are read-only; if the link * is up they represent the negotiated link mode; if the link is down, * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. So, it seems that taprio is not following the API... I'd suggest either fixing taprio, or getting agreement to change the ethtool API. The chosen path was to fix taprio. Fixes: 7b9eba7ba0c1 ("net/sched: taprio: fix picos_per_byte miscalculation") Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 2aab46ada94f..68b543f85a96 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1044,7 +1044,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev, if (err < 0) goto skip; - if (ecmd.base.speed != SPEED_UNKNOWN) + if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: -- cgit v1.2.3 From 83c8c3cf45163f0c823db37be6ab04dfcf8ac751 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Sep 2019 02:39:48 +0300 Subject: net: sched: cbs: Avoid division by zero when calculating the port rate As explained in the "net: sched: taprio: Avoid division by zero on invalid link speed" commit, it is legal for the ethtool API to return zero as a link speed. So guard against it to ensure we don't perform a division by zero in kernel. Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation") Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_cbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 1bef152c5721..b2905b03a432 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -306,7 +306,7 @@ static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) if (err < 0) goto skip; - if (ecmd.base.speed != SPEED_UNKNOWN) + if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: -- cgit v1.2.3 From 93c2fcb01ae919b6e882b0931383d33aaa9bf7a6 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 30 Sep 2019 11:52:21 +0530 Subject: devlink: Fix error handling in param and info_get dumpit cb If any of the param or info_get op returns error, dumpit cb is skipping to dump remaining params or info_get ops for all the drivers. Fix to not return if any of the param/info_get op returns error as not supported and continue to dump remaining information. v2: Modify the patch to return error, except for params/info_get op that return -EOPNOTSUPP as suggested by Andrew Lunn. Also, modify commit message to reflect the same. Cc: Andrew Lunn Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam Acked-by: Jiri Pirko Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/core/devlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index e48680efe54a..f80151eeaf51 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3172,7 +3172,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { + if (err && err != -EOPNOTSUPP) { mutex_unlock(&devlink->lock); goto out; } @@ -3432,7 +3432,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) { + if (err && err != -EOPNOTSUPP) { mutex_unlock(&devlink->lock); goto out; } @@ -4088,7 +4088,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); mutex_unlock(&devlink->lock); - if (err) + if (err && err != -EOPNOTSUPP) break; idx++; } -- cgit v1.2.3 From 0d9138ffac24cf8b75366ede3a68c951e6dcc575 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 30 Sep 2019 18:43:50 +0000 Subject: vsock: Fix a lockdep warning in __vsock_release() Lockdep is unhappy if two locks from the same class are held. Fix the below warning for hyperv and virtio sockets (vmci socket code doesn't have the issue) by using lock_sock_nested() when __vsock_release() is called recursively: ============================================ WARNING: possible recursive locking detected 5.3.0+ #1 Not tainted -------------------------------------------- server/1795 is trying to acquire lock: ffff8880c5158990 (sk_lock-AF_VSOCK){+.+.}, at: hvs_release+0x10/0x120 [hv_sock] but task is already holding lock: ffff8880c5158150 (sk_lock-AF_VSOCK){+.+.}, at: __vsock_release+0x2e/0xf0 [vsock] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(sk_lock-AF_VSOCK); lock(sk_lock-AF_VSOCK); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by server/1795: #0: ffff8880c5d05ff8 (&sb->s_type->i_mutex_key#10){+.+.}, at: __sock_release+0x2d/0xa0 #1: ffff8880c5158150 (sk_lock-AF_VSOCK){+.+.}, at: __vsock_release+0x2e/0xf0 [vsock] stack backtrace: CPU: 5 PID: 1795 Comm: server Not tainted 5.3.0+ #1 Call Trace: dump_stack+0x67/0x90 __lock_acquire.cold.67+0xd2/0x20b lock_acquire+0xb5/0x1c0 lock_sock_nested+0x6d/0x90 hvs_release+0x10/0x120 [hv_sock] __vsock_release+0x24/0xf0 [vsock] __vsock_release+0xa0/0xf0 [vsock] vsock_release+0x12/0x30 [vsock] __sock_release+0x37/0xa0 sock_close+0x14/0x20 __fput+0xc1/0x250 task_work_run+0x98/0xc0 do_exit+0x344/0xc60 do_group_exit+0x47/0xb0 get_signal+0x15c/0xc50 do_signal+0x30/0x720 exit_to_usermode_loop+0x50/0xa0 do_syscall_64+0x24e/0x270 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f4184e85f31 Tested-by: Stefano Garzarella Signed-off-by: Dexuan Cui Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 16 ++++++++++++---- net/vmw_vsock/hyperv_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ab47bf3ab66e..2ab43b2bba31 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -638,7 +638,7 @@ struct sock *__vsock_create(struct net *net, } EXPORT_SYMBOL_GPL(__vsock_create); -static void __vsock_release(struct sock *sk) +static void __vsock_release(struct sock *sk, int level) { if (sk) { struct sk_buff *skb; @@ -648,9 +648,17 @@ static void __vsock_release(struct sock *sk) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ + /* The release call is supposed to use lock_sock_nested() + * rather than lock_sock(), if a sock lock should be acquired. + */ transport->release(vsk); - lock_sock(sk); + /* When "level" is SINGLE_DEPTH_NESTING, use the nested + * version to avoid the warning "possible recursive locking + * detected". When "level" is 0, lock_sock_nested(sk, level) + * is the same as lock_sock(sk). + */ + lock_sock_nested(sk, level); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -659,7 +667,7 @@ static void __vsock_release(struct sock *sk) /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { - __vsock_release(pending); + __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } @@ -708,7 +716,7 @@ EXPORT_SYMBOL_GPL(vsock_stream_has_space); static int vsock_release(struct socket *sock) { - __vsock_release(sock->sk); + __vsock_release(sock->sk, 0); sock->sk = NULL; sock->state = SS_FREE; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 261521d286d6..c443db7af8d4 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -559,7 +559,7 @@ static void hvs_release(struct vsock_sock *vsk) struct sock *sk = sk_vsock(vsk); bool remove_sock; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); remove_sock = hvs_close_lock_held(vsk); release_sock(sk); if (remove_sock) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 5bb70c692b1e..a666ef8fc54e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -820,7 +820,7 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); -- cgit v1.2.3 From 3256a2d6ab1f71f9a1bd2d7f6f18eb8108c48d17 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Sep 2019 15:44:44 -0700 Subject: tcp: adjust rto_base in retransmits_timed_out() The cited commit exposed an old retransmits_timed_out() bug which assumed it could call tcp_model_timeout() with TCP_RTO_MIN as rto_base for all states. But flows in SYN_SENT or SYN_RECV state uses a different RTO base (1 sec instead of 200 ms, unless BPF choses another value) This caused a reduction of SYN retransmits from 6 to 4 with the default /proc/sys/net/ipv4/tcp_syn_retries value. Fixes: a41e8a88b06e ("tcp: better handle TCP_USER_TIMEOUT in SYN_SENT state") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Marek Majkowski Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 40de2d2364a1..05be564414e9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -198,8 +198,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (likely(timeout == 0)) - timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); + if (likely(timeout == 0)) { + unsigned int rto_base = TCP_RTO_MIN; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + rto_base = tcp_timeout_init(sk); + timeout = tcp_model_timeout(sk, boundary, rto_base); + } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } -- cgit v1.2.3 From a3ce2a21bb8969ae27917281244fa91bf5f286d7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 30 Sep 2019 20:28:34 -0700 Subject: ipv6: Handle race in addrconf_dad_work Rajendra reported a kernel panic when a link was taken down: [ 6870.263084] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 [ 6870.271856] IP: [] __ipv6_ifa_notify+0x154/0x290 [ 6870.570501] Call Trace: [ 6870.573238] [] ? ipv6_ifa_notify+0x26/0x40 [ 6870.579665] [] ? addrconf_dad_completed+0x4c/0x2c0 [ 6870.586869] [] ? ipv6_dev_mc_inc+0x196/0x260 [ 6870.593491] [] ? addrconf_dad_work+0x10a/0x430 [ 6870.600305] [] ? __switch_to_asm+0x34/0x70 [ 6870.606732] [] ? process_one_work+0x18a/0x430 [ 6870.613449] [] ? worker_thread+0x4d/0x490 [ 6870.619778] [] ? process_one_work+0x430/0x430 [ 6870.626495] [] ? kthread+0xd9/0xf0 [ 6870.632145] [] ? __switch_to_asm+0x34/0x70 [ 6870.638573] [] ? kthread_park+0x60/0x60 [ 6870.644707] [] ? ret_from_fork+0x57/0x70 [ 6870.650936] Code: 31 c0 31 d2 41 b9 20 00 08 02 b9 09 00 00 0 addrconf_dad_work is kicked to be scheduled when a device is brought up. There is a race between addrcond_dad_work getting scheduled and taking the rtnl lock and a process taking the link down (under rtnl). The latter removes the host route from the inet6_addr as part of addrconf_ifdown which is run for NETDEV_DOWN. The former attempts to use the host route in ipv6_ifa_notify. If the down event removes the host route due to the race to the rtnl, then the BUG listed above occurs. This scenario does not occur when the ipv6 address is not kept (net.ipv6.conf.all.keep_addr_on_down = 0) as addrconf_ifdown sets the state of the ifp to DEAD. Handle when the addresses are kept by checking IF_READY which is reset by addrconf_ifdown. The 'dead' flag for an inet6_addr is set only under rtnl, in addrconf_ifdown and it means the device is getting removed (or IPv6 is disabled). The interesting cases for changing the idev flag are addrconf_notify (NETDEV_UP and NETDEV_CHANGE) and addrconf_ifdown (reset the flag). The former does not have the idev lock - only rtnl; the latter has both. Based on that the existing dead + IF_READY check can be moved to right after the rtnl_lock in addrconf_dad_work. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Reported-by: Rajendra Dendukuri Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a576ff92c39..dd3be06d5a06 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4032,6 +4032,12 @@ static void addrconf_dad_work(struct work_struct *w) rtnl_lock(); + /* check if device was taken down before this delayed work + * function could be canceled + */ + if (idev->dead || !(idev->if_flags & IF_READY)) + goto out; + spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { action = DAD_BEGIN; @@ -4077,11 +4083,6 @@ static void addrconf_dad_work(struct work_struct *w) goto out; write_lock_bh(&idev->lock); - if (idev->dead || !(idev->if_flags & IF_READY)) { - write_unlock_bh(&idev->lock); - goto out; - } - spin_lock(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DEAD) { spin_unlock(&ifp->lock); -- cgit v1.2.3 From e95584a889e1902fdf1ded9712e2c3c3083baf96 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Wed, 2 Oct 2019 18:49:43 +0700 Subject: tipc: fix unlimited bundling of small messages We have identified a problem with the "oversubscription" policy in the link transmission code. When small messages are transmitted, and the sending link has reached the transmit window limit, those messages will be bundled and put into the link backlog queue. However, bundles of data messages are counted at the 'CRITICAL' level, so that the counter for that level, instead of the counter for the real, bundled message's level is the one being increased. Subsequent, to-be-bundled data messages at non-CRITICAL levels continue to be tested against the unchanged counter for their own level, while contributing to an unrestrained increase at the CRITICAL backlog level. This leaves a gap in congestion control algorithm for small messages that can result in starvation for other users or a "real" CRITICAL user. Even that eventually can lead to buffer exhaustion & link reset. We fix this by keeping a 'target_bskb' buffer pointer at each levels, then when bundling, we only bundle messages at the same importance level only. This way, we know exactly how many slots a certain level have occupied in the queue, so can manage level congestion accurately. By bundling messages at the same level, we even have more benefits. Let consider this: - One socket sends 64-byte messages at the 'CRITICAL' level; - Another sends 4096-byte messages at the 'LOW' level; When a 64-byte message comes and is bundled the first time, we put the overhead of message bundle to it (+ 40-byte header, data copy, etc.) for later use, but the next message can be a 4096-byte one that cannot be bundled to the previous one. This means the last bundle carries only one payload message which is totally inefficient, as for the receiver also! Later on, another 64-byte message comes, now we make a new bundle and the same story repeats... With the new bundling algorithm, this will not happen, the 64-byte messages will be bundled together even when the 4096-byte message(s) comes in between. However, if the 4096-byte messages are sent at the same level i.e. 'CRITICAL', the bundling algorithm will again cause the same overhead. Also, the same will happen even with only one socket sending small messages at a rate close to the link transmit's one, so that, when one message is bundled, it's transmitted shortly. Then, another message comes, a new bundle is created and so on... We will solve this issue radically by another patch. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Reported-by: Hoang Le Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 29 ++++++++++++++++++----------- net/tipc/msg.c | 5 +---- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 6cc75ffd9e2c..999eab592de8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -160,6 +160,7 @@ struct tipc_link { struct { u16 len; u16 limit; + struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; u16 window; @@ -880,6 +881,7 @@ static void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { struct sk_buff_head list; + u32 imp; __skb_queue_head_init(&list); @@ -901,11 +903,10 @@ void tipc_link_reset(struct tipc_link *l) __skb_queue_purge(&l->deferdq); __skb_queue_purge(&l->backlogq); __skb_queue_purge(&l->failover_deferdq); - l->backlog[TIPC_LOW_IMPORTANCE].len = 0; - l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; - l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; - l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; - l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; + for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) { + l->backlog[imp].len = 0; + l->backlog[imp].target_bskb = NULL; + } kfree_skb(l->reasm_buf); kfree_skb(l->reasm_tnlmsg); kfree_skb(l->failover_reasm_skb); @@ -947,7 +948,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; struct sk_buff_head *transmq = &l->transmq; struct sk_buff_head *backlogq = &l->backlogq; - struct sk_buff *skb, *_skb, *bskb; + struct sk_buff *skb, *_skb, **tskb; int pkt_cnt = skb_queue_len(list); int rc = 0; @@ -999,19 +1000,21 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, seqno++; continue; } - if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) { + tskb = &l->backlog[imp].target_bskb; + if (tipc_msg_bundle(*tskb, hdr, mtu)) { kfree_skb(__skb_dequeue(list)); l->stats.sent_bundled++; continue; } - if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) { + if (tipc_msg_make_bundle(tskb, hdr, mtu, l->addr)) { kfree_skb(__skb_dequeue(list)); - __skb_queue_tail(backlogq, bskb); - l->backlog[msg_importance(buf_msg(bskb))].len++; + __skb_queue_tail(backlogq, *tskb); + l->backlog[imp].len++; l->stats.sent_bundled++; l->stats.sent_bundles++; continue; } + l->backlog[imp].target_bskb = NULL; l->backlog[imp].len += skb_queue_len(list); skb_queue_splice_tail_init(list, backlogq); } @@ -1027,6 +1030,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, u16 seqno = l->snd_nxt; u16 ack = l->rcv_nxt - 1; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u32 imp; while (skb_queue_len(&l->transmq) < l->window) { skb = skb_peek(&l->backlogq); @@ -1037,7 +1041,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l, break; __skb_dequeue(&l->backlogq); hdr = buf_msg(skb); - l->backlog[msg_importance(hdr)].len--; + imp = msg_importance(hdr); + l->backlog[imp].len--; + if (unlikely(skb == l->backlog[imp].target_bskb)) + l->backlog[imp].target_bskb = NULL; __skb_queue_tail(&l->transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index e6d49cdc61b4..922d262e153f 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -543,10 +543,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, bmsg = buf_msg(_skb); tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0, INT_H_SIZE, dnode); - if (msg_isdata(msg)) - msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE); - else - msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE); + msg_set_importance(bmsg, msg_importance(msg)); msg_set_seqno(bmsg, msg_seqno(msg)); msg_set_ack(bmsg, msg_ack(msg)); msg_set_bcast_ack(bmsg, msg_bcast_ack(msg)); -- cgit v1.2.3 From d64bf89a75b65f83f06be9fb8f978e60d53752db Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Tue, 1 Oct 2019 10:21:02 -0700 Subject: net/rds: Fix error handling in rds_ib_add_one() rds_ibdev:ipaddr_list and rds_ibdev:conn_list are initialized after allocation some resources such as protection domain. If allocation of such resources fail, then these uninitialized variables are accessed in rds_ib_dev_free() in failure path. This can potentially crash the system. The code has been updated to initialize these variables very early in the function. Signed-off-by: Dotan Barak Signed-off-by: Sudhakar Dindukurti Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 45acab2de0cf..9de2ae22d583 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -143,6 +143,9 @@ static void rds_ib_add_one(struct ib_device *device) refcount_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); @@ -203,9 +206,6 @@ static void rds_ib_add_one(struct ib_device *device) device->name, rds_ibdev->use_fastreg ? "FRMR" : "FMR"); - INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); - INIT_LIST_HEAD(&rds_ibdev->conn_list); - down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); up_write(&rds_ib_devices_lock); -- cgit v1.2.3 From 3e8db7e56082156a37b71d7334860c10fcea8025 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 21:58:19 +0300 Subject: net: dsa: sja1105: Fix sleeping while atomic in .port_hwtstamp_set Currently this stack trace can be seen with CONFIG_DEBUG_ATOMIC_SLEEP=y: [ 41.568348] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 41.576757] in_atomic(): 1, irqs_disabled(): 0, pid: 208, name: ptp4l [ 41.583212] INFO: lockdep is turned off. [ 41.587123] CPU: 1 PID: 208 Comm: ptp4l Not tainted 5.3.0-rc6-01445-ge950f2d4bc7f-dirty #1827 [ 41.599873] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 41.607584] [] (show_stack) from [] (dump_stack+0xd4/0x100) [ 41.614863] [] (dump_stack) from [] (___might_sleep+0x1c8/0x2b4) [ 41.622574] [] (___might_sleep) from [] (__mutex_lock+0x48/0xab8) [ 41.630368] [] (__mutex_lock) from [] (mutex_lock_nested+0x1c/0x24) [ 41.638340] [] (mutex_lock_nested) from [] (sja1105_static_config_reload+0x30/0x27c) [ 41.647779] [] (sja1105_static_config_reload) from [] (sja1105_hwtstamp_set+0x108/0x1cc) [ 41.657562] [] (sja1105_hwtstamp_set) from [] (dev_ifsioc+0x18c/0x330) [ 41.665788] [] (dev_ifsioc) from [] (dev_ioctl+0x320/0x6e8) [ 41.673064] [] (dev_ioctl) from [] (sock_ioctl+0x334/0x5e8) [ 41.680340] [] (sock_ioctl) from [] (do_vfs_ioctl+0xb0/0xa10) [ 41.687789] [] (do_vfs_ioctl) from [] (ksys_ioctl+0x34/0x58) [ 41.695151] [] (ksys_ioctl) from [] (ret_fast_syscall+0x0/0x28) [ 41.702768] Exception stack(0xe8495fa8 to 0xe8495ff0) [ 41.707796] 5fa0: beff4a8c 00000001 00000011 000089b0 beff4a8c beff4a80 [ 41.715933] 5fc0: beff4a8c 00000001 0000000c 00000036 b6fa98c8 004e19c1 00000001 00000000 [ 41.724069] 5fe0: 004dcedc beff4a6c 004c0738 b6e7af4c [ 41.729860] BUG: scheduling while atomic: ptp4l/208/0x00000002 [ 41.735682] INFO: lockdep is turned off. Enabling RX timestamping will logically disturb the fastpath (processing of meta frames). Replace bool hwts_rx_en with a bit that is checked atomically from the fastpath and temporarily unset from the sleepable context during a change of the RX timestamping process (a destructive operation anyways, requires switch reset). If found unset, the fastpath (net/dsa/tag_sja1105.c) will just drop any received meta frame and not take the meta_lock at all. Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c9aff3e52cf..63ef2a14c934 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -156,7 +156,11 @@ static struct sk_buff /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ - if (is_link_local && sp->data->hwts_rx_en) { + if (is_link_local) { + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + /* Do normal processing. */ + return skb; + spin_lock(&sp->data->meta_lock); /* Was this a link-local frame instead of the meta * that we were expecting? @@ -187,6 +191,12 @@ static struct sk_buff } else if (is_meta) { struct sk_buff *stampable_skb; + /* Drop the meta frame if we're not in the right state + * to process it. + */ + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + return NULL; + spin_lock(&sp->data->meta_lock); stampable_skb = sp->data->stampable_skb; -- cgit v1.2.3 From df551058f7a303bd3a17a4cef001349974962ce8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 Oct 2019 08:31:59 +0200 Subject: xsk: Fix crash in poll when device does not support ndo_xsk_wakeup Fixes a crash in poll() when an AF_XDP socket is opened in copy mode and the bound device does not have ndo_xsk_wakeup defined. Avoid trying to call the non-existing ndo and instead call the internal xsk sendmsg function to send packets in the same way (from the application's point of view) as calling sendmsg() in any mode or poll() in zero-copy mode would have done. The application should behave in the same way independent on if zero-copy mode or copy mode is used. Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings") Reported-by: syzbot+a5765ed8cdb1cca4d249@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1569997919-11541-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index fa8fbb8fa3c8..9044073fbf22 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -305,9 +305,8 @@ out: } EXPORT_SYMBOL(xsk_umem_consume_tx); -static int xsk_zc_xmit(struct sock *sk) +static int xsk_zc_xmit(struct xdp_sock *xs) { - struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev = xs->dev; return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, @@ -327,11 +326,10 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, - size_t total_len) +static int xsk_generic_xmit(struct sock *sk) { - u32 max_batch = TX_BATCH_SIZE; struct xdp_sock *xs = xdp_sk(sk); + u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; @@ -394,6 +392,18 @@ out: return err; } +static int __xsk_sendmsg(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + if (unlikely(!xs->tx)) + return -ENOBUFS; + + return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); +} + static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); @@ -402,21 +412,18 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) if (unlikely(!xsk_is_bound(xs))) return -ENXIO; - if (unlikely(!(xs->dev->flags & IFF_UP))) - return -ENETDOWN; - if (unlikely(!xs->tx)) - return -ENOBUFS; - if (need_wait) + if (unlikely(need_wait)) return -EOPNOTSUPP; - return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); + return __xsk_sendmsg(sk); } static unsigned int xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { unsigned int mask = datagram_poll(file, sock, wait); - struct xdp_sock *xs = xdp_sk(sock->sk); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; struct xdp_umem *umem; @@ -426,9 +433,14 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock, dev = xs->dev; umem = xs->umem; - if (umem->need_wakeup) - dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, - umem->need_wakeup); + if (umem->need_wakeup) { + if (dev->netdev_ops->ndo_xsk_wakeup) + dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, + umem->need_wakeup); + else + /* Poll needs to drive Tx also in copy mode */ + __xsk_sendmsg(sk); + } if (xs->rx && !xskq_empty_desc(xs->rx)) mask |= POLLIN | POLLRDNORM; -- cgit v1.2.3 From 6af1799aaf3f1bc8defedddfa00df3192445bbf3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2019 09:38:55 -0700 Subject: ipv6: drop incoming packets having a v4mapped source address This began with a syzbot report. syzkaller was injecting IPv6 TCP SYN packets having a v4mapped source address. After an unsuccessful 4-tuple lookup, TCP creates a request socket (SYN_RECV) and calls reqsk_queue_hash_req() reqsk_queue_hash_req() calls sk_ehashfn(sk) At this point we have AF_INET6 sockets, and the heuristic used by sk_ehashfn() to either hash the IPv4 or IPv6 addresses is to use ipv6_addr_v4mapped(&sk->sk_v6_daddr) For the particular spoofed packet, we end up hashing V4 addresses which were not initialized by the TCP IPv6 stack, so KMSAN fired a warning. I first fixed sk_ehashfn() to test both source and destination addresses, but then faced various problems, including user-space programs like packetdrill that had similar assumptions. Instead of trying to fix the whole ecosystem, it is better to admit that we have a dual stack behavior, and that we can not build linux kernels without V4 stack anyway. The dual stack API automatically forces the traffic to be IPv4 if v4mapped addresses are used at bind() or connect(), so it makes no sense to allow IPv6 traffic to use the same v4mapped class. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Hannes Frederic Sowa Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 7e5df23cbe7b..3d71c7d6102c 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -223,6 +223,16 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; + /* While RFC4291 is not explicit about v4mapped addresses + * in IPv6 headers, it seems clear linux dual-stack + * model can not deal properly with these. + * Security models could be fooled by ::ffff:127.0.0.1 for example. + * + * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02 + */ + if (ipv6_addr_v4mapped(&hdr->saddr)) + goto err; + skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); -- cgit v1.2.3 From 44b321e5020d782ad6e8ae8183f09b163be6e6e2 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 2 Oct 2019 13:29:22 -0400 Subject: udp: fix gso_segs calculations Commit dfec0ee22c0a ("udp: Record gso_segs when supporting UDP segmentation offload") added gso_segs calculation, but incorrectly got sizeof() the pointer and not the underlying data type. In addition let's fix the v6 case. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Fixes: dfec0ee22c0a ("udp: Record gso_segs when supporting UDP segmentation offload") Signed-off-by: Josh Hunt Reviewed-by: Alexander Duyck Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e8443cc5c1ab..3e2b90181879 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -856,7 +856,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh), + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), cork->gso_size); goto csum_partial; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index aae4938f3dea..eb9a9934ac05 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1143,6 +1143,8 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), + cork->gso_size); goto csum_partial; } -- cgit v1.2.3 From 4094871db1d65810acab3d57f6089aa39ef7f648 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 2 Oct 2019 13:29:23 -0400 Subject: udp: only do GSO if # of segs > 1 Prior to this change an application sending <= 1MSS worth of data and enabling UDP GSO would fail if the system had SW GSO enabled, but the same send would succeed if HW GSO offload is enabled. In addition to this inconsistency the error in the SW GSO case does not get back to the application if sending out of a real device so the user is unaware of this failure. With this change we only perform GSO if the # of segments is > 1 even if the application has enabled segmentation. I've also updated the relevant udpgso selftests. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Josh Hunt Reviewed-by: Willem de Bruijn Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/udp.c | 11 +++++++---- net/ipv6/udp.c | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3e2b90181879..14bc654b6842 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -821,6 +821,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; + int datalen = len - sizeof(*uh); __wsum csum = 0; /* @@ -854,10 +855,12 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, return -EIO; } - skb_shinfo(skb)->gso_size = cork->gso_size; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), - cork->gso_size); + if (datalen > cork->gso_size) { + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, + cork->gso_size); + } goto csum_partial; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index eb9a9934ac05..6324d3a8cb53 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1109,6 +1109,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, __wsum csum = 0; int offset = skb_transport_offset(skb); int len = skb->len - offset; + int datalen = len - sizeof(*uh); /* * Create a UDP header @@ -1141,10 +1142,12 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, return -EIO; } - skb_shinfo(skb)->gso_size = cork->gso_size; - skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(*uh), - cork->gso_size); + if (datalen > cork->gso_size) { + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, + cork->gso_size); + } goto csum_partial; } -- cgit v1.2.3 From 3afb0961884046c8fb4acbce65139088959681c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2019 20:19:59 -0700 Subject: tcp: fix slab-out-of-bounds in tcp_zerocopy_receive() Apparently a refactoring patch brought a bug, that was caught by syzbot [1] Original code was correct, do not try to be smarter than the compiler :/ [1] BUG: KASAN: slab-out-of-bounds in tcp_zerocopy_receive net/ipv4/tcp.c:1807 [inline] BUG: KASAN: slab-out-of-bounds in do_tcp_getsockopt.isra.0+0x2c6c/0x3120 net/ipv4/tcp.c:3654 Read of size 4 at addr ffff8880943cf188 by task syz-executor.2/17508 CPU: 0 PID: 17508 Comm: syz-executor.2 Not tainted 5.3.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351 __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482 kasan_report+0x12/0x17 mm/kasan/common.c:618 __asan_report_load4_noabort+0x14/0x20 mm/kasan/generic_report.c:131 tcp_zerocopy_receive net/ipv4/tcp.c:1807 [inline] do_tcp_getsockopt.isra.0+0x2c6c/0x3120 net/ipv4/tcp.c:3654 tcp_getsockopt+0xbf/0xe0 net/ipv4/tcp.c:3680 sock_common_getsockopt+0x94/0xd0 net/core/sock.c:3098 __sys_getsockopt+0x16d/0x310 net/socket.c:2129 __do_sys_getsockopt net/socket.c:2144 [inline] __se_sys_getsockopt net/socket.c:2141 [inline] __x64_sys_getsockopt+0xbe/0x150 net/socket.c:2141 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 Fixes: d8e18a516f8f ("net: Use skb accessors in network core") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Cc: Matthew Wilcox (Oracle) Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 79c325a07ba5..f98a1882e537 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1798,13 +1798,11 @@ static int tcp_zerocopy_receive(struct sock *sk, } if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { int remaining = zc->recv_skip_hint; - int size = skb_frag_size(frags); - while (remaining && (size != PAGE_SIZE || + while (remaining && (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags))) { - remaining -= size; + remaining -= skb_frag_size(frags); frags++; - size = skb_frag_size(frags); } zc->recv_skip_hint -= remaining; break; -- cgit v1.2.3 From 4152561f5da3fca92af7179dd538ea89e248f9d0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 4 Oct 2019 10:51:31 +0100 Subject: mac80211: Reject malformed SSID elements Although this shouldn't occur in practice, it's a good idea to bounds check the length field of the SSID element prior to using it for things like allocations or memcpy operations. Cc: Cc: Kees Cook Reported-by: Nicolas Waisman Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20191004095132.15777-1-will@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 26a2f49208b6..54dd8849d1cc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2633,7 +2633,8 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, rcu_read_lock(); ssid = ieee80211_bss_get_ie(cbss, WLAN_EID_SSID); - if (WARN_ON_ONCE(ssid == NULL)) + if (WARN_ONCE(!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN, + "invalid SSID element (len=%d)", ssid ? ssid[1] : -1)) ssid_len = 0; else ssid_len = ssid[1]; @@ -5233,7 +5234,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); ssidie = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); - if (!ssidie) { + if (!ssidie || ssidie[1] > sizeof(assoc_data->ssid)) { rcu_read_unlock(); kfree(assoc_data); return -EINVAL; -- cgit v1.2.3 From 4ac2813cc867ae563a1ba5a9414bfb554e5796fa Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 4 Oct 2019 10:51:32 +0100 Subject: cfg80211: wext: avoid copying malformed SSIDs Ensure the SSID element is bounds-checked prior to invoking memcpy() with its length field, when copying to userspace. Cc: Cc: Kees Cook Reported-by: Nicolas Waisman Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20191004095132.15777-2-will@kernel.org [adjust commit log a bit] Signed-off-by: Johannes Berg --- net/wireless/wext-sme.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index c67d7a82ab13..73fd0eae08ca 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -202,6 +202,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; + int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) @@ -219,7 +220,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, if (ie) { data->flags = 1; data->length = ie[1]; - memcpy(ssid, ie + 2, data->length); + if (data->length > IW_ESSID_MAX_SIZE) + ret = -EINVAL; + else + memcpy(ssid, ie + 2, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { @@ -229,7 +233,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, } wdev_unlock(wdev); - return 0; + return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, -- cgit v1.2.3 From 7a512eb865aa5fcfb396c71047bc9c3b046836b3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 4 Oct 2019 00:44:40 +0300 Subject: net: make sock_prot_memory_pressure() return "const char *" This function returns string literals which are "const char *". Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index e23ec80a67bf..fac2b4d80de5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3497,7 +3497,7 @@ static long sock_prot_memory_allocated(struct proto *proto) return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } -static char *sock_prot_memory_pressure(struct proto *proto) +static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; -- cgit v1.2.3 From 8ae72cbf62d2c1879456c0c5872f958e18f53711 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 3 Oct 2019 14:46:15 -0700 Subject: Revert "ipv6: Handle race in addrconf_dad_work" This reverts commit a3ce2a21bb8969ae27917281244fa91bf5f286d7. Eric reported tests failings with commit. After digging into it, the bottom line is that the DAD sequence is not to be messed with. There are too many cases that are expected to proceed regardless of whether a device is up. Revert the patch and I will send a different solution for the problem Rajendra reported. Signed-off-by: David Ahern Cc: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dd3be06d5a06..6a576ff92c39 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4032,12 +4032,6 @@ static void addrconf_dad_work(struct work_struct *w) rtnl_lock(); - /* check if device was taken down before this delayed work - * function could be canceled - */ - if (idev->dead || !(idev->if_flags & IF_READY)) - goto out; - spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { action = DAD_BEGIN; @@ -4083,6 +4077,11 @@ static void addrconf_dad_work(struct work_struct *w) goto out; write_lock_bh(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) { + write_unlock_bh(&idev->lock); + goto out; + } + spin_lock(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DEAD) { spin_unlock(&ifp->lock); -- cgit v1.2.3 From b406472b5ad79ede8d10077f0c8f05505ace8b6d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 Oct 2019 15:11:17 +0200 Subject: net: ipv4: avoid mixed n_redirects and rate_tokens usage Since commit c09551c6ff7f ("net: ipv4: use a dedicated counter for icmp_v4 redirect packets") we use 'n_redirects' to account for redirect packets, but we still use 'rate_tokens' to compute the redirect packets exponential backoff. If the device sent to the relevant peer any ICMP error packet after sending a redirect, it will also update 'rate_token' according to the leaking bucket schema; typically 'rate_token' will raise above BITS_PER_LONG and the redirect packets backoff algorithm will produce undefined behavior. Fix the issue using 'n_redirects' to compute the exponential backoff in ip_rt_send_redirect(). Note that we still clear rate_tokens after a redirect silence period, to avoid changing an established behaviour. The root cause predates git history; before the mentioned commit in the critical scenario, the kernel stopped sending redirects, after the mentioned commit the behavior more randomic. Reported-by: Xiumei Mu Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: c09551c6ff7f ("net: ipv4: use a dedicated counter for icmp_v4 redirect packets") Signed-off-by: Paolo Abeni Acked-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/route.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7dcce724c78b..14654876127e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -916,16 +916,15 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (peer->rate_tokens == 0 || time_after(jiffies, (peer->rate_last + - (ip_rt_redirect_load << peer->rate_tokens)))) { + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; - ++peer->rate_tokens; ++peer->n_redirects; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - peer->rate_tokens == ip_rt_redirect_number) + peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); -- cgit v1.2.3 From 2d819d250a1393a3e725715425ab70a0e0772a71 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 4 Oct 2019 08:03:09 -0700 Subject: ipv6: Handle missing host route in __ipv6_ifa_notify Rajendra reported a kernel panic when a link was taken down: [ 6870.263084] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 [ 6870.271856] IP: [] __ipv6_ifa_notify+0x154/0x290 [ 6870.570501] Call Trace: [ 6870.573238] [] ? ipv6_ifa_notify+0x26/0x40 [ 6870.579665] [] ? addrconf_dad_completed+0x4c/0x2c0 [ 6870.586869] [] ? ipv6_dev_mc_inc+0x196/0x260 [ 6870.593491] [] ? addrconf_dad_work+0x10a/0x430 [ 6870.600305] [] ? __switch_to_asm+0x34/0x70 [ 6870.606732] [] ? process_one_work+0x18a/0x430 [ 6870.613449] [] ? worker_thread+0x4d/0x490 [ 6870.619778] [] ? process_one_work+0x430/0x430 [ 6870.626495] [] ? kthread+0xd9/0xf0 [ 6870.632145] [] ? __switch_to_asm+0x34/0x70 [ 6870.638573] [] ? kthread_park+0x60/0x60 [ 6870.644707] [] ? ret_from_fork+0x57/0x70 [ 6870.650936] Code: 31 c0 31 d2 41 b9 20 00 08 02 b9 09 00 00 0 addrconf_dad_work is kicked to be scheduled when a device is brought up. There is a race between addrcond_dad_work getting scheduled and taking the rtnl lock and a process taking the link down (under rtnl). The latter removes the host route from the inet6_addr as part of addrconf_ifdown which is run for NETDEV_DOWN. The former attempts to use the host route in __ipv6_ifa_notify. If the down event removes the host route due to the race to the rtnl, then the BUG listed above occurs. Since the DAD sequence can not be aborted, add a check for the missing host route in __ipv6_ifa_notify. The only way this should happen is due to the previously mentioned race. The host route is created when the address is added to an interface; it is only removed on a down event where the address is kept. Add a warning if the host route is missing AND the device is up; this is a situation that should never happen. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Reported-by: Rajendra Dendukuri Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6a576ff92c39..34ccef18b40e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5964,13 +5964,20 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) switch (event) { case RTM_NEWADDR: /* - * If the address was optimistic - * we inserted the route at the start of - * our DAD process, so we don't need - * to do it again + * If the address was optimistic we inserted the route at the + * start of our DAD process, so we don't need to do it again. + * If the device was taken down in the middle of the DAD + * cycle there is a race where we could get here without a + * host route, so nothing to insert. That will be fixed when + * the device is brought up. */ - if (!rcu_access_pointer(ifp->rt->fib6_node)) + if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) { ip6_ins_rt(net, ifp->rt); + } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) { + pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n", + &ifp->addr, ifp->idev->dev->name); + } + if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) -- cgit v1.2.3 From 474f0813a3002cb299bb73a5a93aa1f537a80ca8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2019 10:34:45 -0700 Subject: sch_dsmark: fix potential NULL deref in dsmark_init() Make sure TCA_DSMARK_INDICES was provided by the user. syzbot reported : kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8799 Comm: syz-executor235 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:nla_get_u16 include/net/netlink.h:1501 [inline] RIP: 0010:dsmark_init net/sched/sch_dsmark.c:364 [inline] RIP: 0010:dsmark_init+0x193/0x640 net/sched/sch_dsmark.c:339 Code: 85 db 58 0f 88 7d 03 00 00 e8 e9 1a ac fb 48 8b 9d 70 ff ff ff 48 b8 00 00 00 00 00 fc ff df 48 8d 7b 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85 ca RSP: 0018:ffff88809426f3b8 EFLAGS: 00010247 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff85c6eb09 RDX: 0000000000000000 RSI: ffffffff85c6eb17 RDI: 0000000000000004 RBP: ffff88809426f4b0 R08: ffff88808c4085c0 R09: ffffed1015d26159 R10: ffffed1015d26158 R11: ffff8880ae930ac7 R12: ffff8880a7e96940 R13: dffffc0000000000 R14: ffff88809426f8c0 R15: 0000000000000000 FS: 0000000001292880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000080 CR3: 000000008ca1b000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: qdisc_create+0x4ee/0x1210 net/sched/sch_api.c:1237 tc_modify_qdisc+0x524/0x1c50 net/sched/sch_api.c:1653 rtnetlink_rcv_msg+0x463/0xb00 net/core/rtnetlink.c:5223 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x803/0x920 net/socket.c:2311 __sys_sendmsg+0x105/0x1d0 net/socket.c:2356 __do_sys_sendmsg net/socket.c:2365 [inline] __se_sys_sendmsg net/socket.c:2363 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440369 Fixes: 758cc43c6d73 ("[PKT_SCHED]: Fix dsmark to apply changes consistent") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index bad1cbe59a56..05605b30bef3 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -361,6 +361,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, goto errout; err = -EINVAL; + if (!tb[TCA_DSMARK_INDICES]) + goto errout; indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); if (hweight32(indices) != 1) -- cgit v1.2.3 From a0c2dc1fe63e2869b74c1c7f6a81d1745c8a695d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2019 11:08:34 -0700 Subject: nfc: fix memory leak in llcp_sock_bind() sysbot reported a memory leak after a bind() has failed. While we are at it, abort the operation if kmemdup() has failed. BUG: memory leak unreferenced object 0xffff888105d83ec0 (size 32): comm "syz-executor067", pid 7207, jiffies 4294956228 (age 19.430s) hex dump (first 32 bytes): 00 69 6c 65 20 72 65 61 64 00 6e 65 74 3a 5b 34 .ile read.net:[4 30 32 36 35 33 33 30 39 37 5d 00 00 00 00 00 00 026533097]...... backtrace: [<0000000036bac473>] kmemleak_alloc_recursive /./include/linux/kmemleak.h:43 [inline] [<0000000036bac473>] slab_post_alloc_hook /mm/slab.h:522 [inline] [<0000000036bac473>] slab_alloc /mm/slab.c:3319 [inline] [<0000000036bac473>] __do_kmalloc /mm/slab.c:3653 [inline] [<0000000036bac473>] __kmalloc_track_caller+0x169/0x2d0 /mm/slab.c:3670 [<000000000cd39d07>] kmemdup+0x27/0x60 /mm/util.c:120 [<000000008e57e5fc>] kmemdup /./include/linux/string.h:432 [inline] [<000000008e57e5fc>] llcp_sock_bind+0x1b3/0x230 /net/nfc/llcp_sock.c:107 [<000000009cb0b5d3>] __sys_bind+0x11c/0x140 /net/socket.c:1647 [<00000000492c3bbc>] __do_sys_bind /net/socket.c:1658 [inline] [<00000000492c3bbc>] __se_sys_bind /net/socket.c:1656 [inline] [<00000000492c3bbc>] __x64_sys_bind+0x1e/0x30 /net/socket.c:1656 [<0000000008704b2a>] do_syscall_64+0x76/0x1a0 /arch/x86/entry/common.c:296 [<000000009f4c57a4>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 30cc4587659e ("NFC: Move LLCP code to the NFC top level diirectory") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 8dfea26536c9..ccdd790e163a 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -107,9 +107,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); - + if (!llcp_sock->service_name) { + ret = -ENOMEM; + goto put_dev; + } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { + kfree(llcp_sock->service_name); + llcp_sock->service_name = NULL; ret = -EADDRINUSE; goto put_dev; } -- cgit v1.2.3 From c48fc11b69e95007109206311b0187a3090591f3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:28 +0100 Subject: rxrpc: Fix call ref leak When sendmsg() finds a call to continue on with, if the call is in an inappropriate state, it doesn't release the ref it just got on that call before returning an error. This causes the following symptom to show up with kasan: BUG: KASAN: use-after-free in rxrpc_send_keepalive+0x8a2/0x940 net/rxrpc/output.c:635 Read of size 8 at addr ffff888064219698 by task kworker/0:3/11077 where line 635 is: whdr.epoch = htonl(peer->local->rxnet->epoch); The local endpoint (which cannot be pinned by the call) has been released, but not the peer (which is pinned by the call). Fix this by releasing the call in the error path. Fixes: 37411cad633f ("rxrpc: Fix potential NULL-pointer exception") Reported-by: syzbot+d850c266e3df14da1d31@syzkaller.appspotmail.com Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6a1547b270fe..22f51a7e356e 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -661,6 +661,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) case RXRPC_CALL_SERVER_PREALLOC: case RXRPC_CALL_SERVER_SECURING: case RXRPC_CALL_SERVER_ACCEPTING: + rxrpc_put_call(call, rxrpc_call_put); ret = -EBUSY; goto error_release_sock; default: -- cgit v1.2.3 From 55f6c98e3674ce16038a1949c3f9ca5a9a99f289 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put peer record rxrpc_put_peer() calls trace_rxrpc_peer() after it has done the decrement of the refcount - which looks at the debug_id in the peer record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. This can cause the following symptoms: BUG: KASAN: use-after-free in __rxrpc_put_peer net/rxrpc/peer_object.c:411 [inline] BUG: KASAN: use-after-free in rxrpc_put_peer+0x685/0x6a0 net/rxrpc/peer_object.c:435 Read of size 8 at addr ffff888097ec0058 by task syz-executor823/24216 Fixes: 1159d4b496f5 ("rxrpc: Add a tracepoint to track rxrpc_peer refcounting") Reported-by: syzbot+b9be979c55f2bea8ed30@syzkaller.appspotmail.com Signed-off-by: David Howells --- net/rxrpc/peer_object.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 9c3ac96f71cb..b700b7ecaa3d 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -382,7 +382,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) int n; n = atomic_inc_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_got, n, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n, here); return peer; } @@ -396,7 +396,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) if (peer) { int n = atomic_fetch_add_unless(&peer->usage, 1, 0); if (n > 0) - trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); + trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n + 1, here); else peer = NULL; } @@ -426,11 +426,13 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id; int n; if (peer) { + debug_id = peer->debug_id; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) __rxrpc_put_peer(peer); } @@ -443,10 +445,11 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) void rxrpc_put_peer_locked(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); + unsigned int debug_id = peer->debug_id; int n; n = atomic_dec_return(&peer->usage); - trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); -- cgit v1.2.3 From 4c1295dccc0afe0905b6ca4c62ade7f2406f2cfb Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put connection record rxrpc_put_*conn() calls trace_rxrpc_conn() after they have done the decrement of the refcount - which looks at the debug_id in the connection record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. Fixes: 363deeab6d0f ("rxrpc: Add connection tracepoint and client conn state tracepoint") Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 2 +- net/rxrpc/conn_client.c | 6 ++++-- net/rxrpc/conn_object.c | 13 +++++++------ net/rxrpc/conn_service.c | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 00c095d74145..c1b1b7dd2924 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -84,7 +84,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), here); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3f1da1b49f69..700eb77173fc 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -212,7 +212,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); - trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, + atomic_read(&conn->usage), __builtin_return_address(0)); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); @@ -985,11 +986,12 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn) void rxrpc_put_client_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; do { n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here); if (n > 0) return; ASSERTCMP(n, >=, 0); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ed05b6922132..38d718e90dc6 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -269,7 +269,7 @@ bool rxrpc_queue_conn(struct rxrpc_connection *conn) if (n == 0) return false; if (rxrpc_queue_work(&conn->processor)) - trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, n + 1, here); else rxrpc_put_connection(conn); return true; @@ -284,7 +284,7 @@ void rxrpc_see_connection(struct rxrpc_connection *conn) if (conn) { int n = atomic_read(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here); } } @@ -296,7 +296,7 @@ void rxrpc_get_connection(struct rxrpc_connection *conn) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here); } /* @@ -310,7 +310,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn) if (conn) { int n = atomic_fetch_add_unless(&conn->usage, 1, 0); if (n > 0) - trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n + 1, here); else conn = NULL; } @@ -333,10 +333,11 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, void rxrpc_put_service_conn(struct rxrpc_connection *conn) { const void *here = __builtin_return_address(0); + unsigned int debug_id = conn->debug_id; int n; n = atomic_dec_return(&conn->usage); - trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, n, here); ASSERTCMP(n, >=, 0); if (n == 1) rxrpc_set_service_reap_timer(conn->params.local->rxnet, @@ -420,7 +421,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; - trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL); + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL); if (rxrpc_conn_is_client(conn)) BUG(); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index b30e13f6d95f..123d6ceab15c 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -134,7 +134,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - trace_rxrpc_conn(conn, rxrpc_conn_new_service, + trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, atomic_read(&conn->usage), __builtin_return_address(0)); } -- cgit v1.2.3 From 48c9e0ec7cbbb7370448f859ccc8e3b7eb69e755 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put call record rxrpc_put_call() calls trace_rxrpc_call() after it has done the decrement of the refcount - which looks at the debug_id in the call record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. Fixes: e34d4234b0b7 ("rxrpc: Trace rxrpc_call usage") Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 2 +- net/rxrpc/call_object.c | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index c1b1b7dd2924..1f778102ed8d 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -97,7 +97,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->flags |= (1 << RXRPC_CALL_IS_SERVICE); call->state = RXRPC_CALL_SERVER_PREALLOC; - trace_rxrpc_call(call, rxrpc_call_new_service, + trace_rxrpc_call(call->debug_id, rxrpc_call_new_service, atomic_read(&call->usage), here, (const void *)user_call_ID); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 32d8dc677142..6dace078971a 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -240,7 +240,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (p->intr) __set_bit(RXRPC_CALL_IS_INTR, &call->flags); call->tx_total_len = p->tx_total_len; - trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, + atomic_read(&call->usage), here, (const void *)p->user_call_ID); /* We need to protect a partially set up call against the user as we @@ -290,8 +291,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error; - trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), - here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_connected, + atomic_read(&call->usage), here, NULL); rxrpc_start_call_timer(call); @@ -313,8 +314,8 @@ error_dup_user_ID: error: __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); - trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), - here, ERR_PTR(ret)); + trace_rxrpc_call(call->debug_id, rxrpc_call_error, + atomic_read(&call->usage), here, ERR_PTR(ret)); rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); @@ -376,7 +377,8 @@ bool rxrpc_queue_call(struct rxrpc_call *call) if (n == 0) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -391,7 +393,8 @@ bool __rxrpc_queue_call(struct rxrpc_call *call) int n = atomic_read(&call->usage); ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n, + here, NULL); else rxrpc_put_call(call, rxrpc_call_put_noqueue); return true; @@ -406,7 +409,8 @@ void rxrpc_see_call(struct rxrpc_call *call) if (call) { int n = atomic_read(&call->usage); - trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL); + trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n, + here, NULL); } } @@ -418,7 +422,7 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) const void *here = __builtin_return_address(0); int n = atomic_inc_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(call->debug_id, op, n, here, NULL); } /* @@ -445,7 +449,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); - trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + trace_rxrpc_call(call->debug_id, rxrpc_call_release, + atomic_read(&call->usage), here, (const void *)call->flags); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); @@ -534,12 +539,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { struct rxrpc_net *rxnet = call->rxnet; const void *here = __builtin_return_address(0); + unsigned int debug_id = call->debug_id; int n; ASSERT(call != NULL); n = atomic_dec_return(&call->usage); - trace_rxrpc_call(call, op, n, here, NULL); + trace_rxrpc_call(debug_id, op, n, here, NULL); ASSERTCMP(n, >=, 0); if (n == 0) { _debug("call %d dead", call->debug_id); -- cgit v1.2.3 From 9ebeddef58c41bd700419cdcece24cf64ce32276 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: rxrpc_peer needs to hold a ref on the rxrpc_local record The rxrpc_peer record needs to hold a reference on the rxrpc_local record it points as the peer is used as a base to access information in the rxrpc_local record. This can cause problems in __rxrpc_put_peer(), where we need the network namespace pointer, and in rxrpc_send_keepalive(), where we need to access the UDP socket, leading to symptoms like: BUG: KASAN: use-after-free in __rxrpc_put_peer net/rxrpc/peer_object.c:411 [inline] BUG: KASAN: use-after-free in rxrpc_put_peer+0x685/0x6a0 net/rxrpc/peer_object.c:435 Read of size 8 at addr ffff888097ec0058 by task syz-executor823/24216 Fix this by taking a ref on the local record for the peer record. Fixes: ace45bec6d77 ("rxrpc: Fix firewall route keepalive") Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing") Reported-by: syzbot+b9be979c55f2bea8ed30@syzkaller.appspotmail.com Signed-off-by: David Howells --- net/rxrpc/peer_object.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index b700b7ecaa3d..64830d8c1fdb 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -216,7 +216,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { atomic_set(&peer->usage, 1); - peer->local = local; + peer->local = rxrpc_get_local(local); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); @@ -307,7 +307,6 @@ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); - peer->local = local; rxrpc_init_peer(rx, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); @@ -417,6 +416,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); + rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } @@ -453,6 +453,7 @@ void rxrpc_put_peer_locked(struct rxrpc_peer *peer) if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); + rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } } -- cgit v1.2.3 From 91fcfbe8852edb929ff8702534525031a15d0aa6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix call crypto state cleanup Fix the cleanup of the crypto state on a call after the call has been disconnected. As the call has been disconnected, its connection ref has been discarded and so we can't go through that to get to the security ops table. Fix this by caching the security ops pointer in the rxrpc_call struct and using that when freeing the call security state. Also use this in other places we're dealing with call-specific security. The symptoms look like: BUG: KASAN: use-after-free in rxrpc_release_call+0xb2d/0xb60 net/rxrpc/call_object.c:481 Read of size 8 at addr ffff888062ffeb50 by task syz-executor.5/4764 Fixes: 1db88c534371 ("rxrpc: Fix -Wframe-larger-than= warnings from on-stack crypto") Reported-by: syzbot+eed305768ece6682bb7f@syzkaller.appspotmail.com Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_accept.c | 1 + net/rxrpc/call_object.c | 6 +++--- net/rxrpc/conn_client.c | 3 +++ net/rxrpc/recvmsg.c | 6 +++--- net/rxrpc/sendmsg.c | 2 +- 6 files changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1091bf35a199..ecc17dabec8f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -556,6 +556,7 @@ struct rxrpc_call { struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ + const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 1f778102ed8d..135bf5cd8dd5 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -307,6 +307,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_see_call(call); call->conn = conn; + call->security = conn->security; call->peer = rxrpc_get_peer(conn->params.peer); call->cong_cwnd = call->peer->cong_cwnd; return call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 6dace078971a..a31c18c09894 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -493,10 +493,10 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - if (conn) { + if (conn) rxrpc_disconnect_call(call); - conn->security->free_call_crypto(call); - } + if (call->security) + call->security->free_call_crypto(call); rxrpc_cleanup_ring(call); _leave(""); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 700eb77173fc..376370cd9285 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -353,6 +353,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx, if (cp->exclusive) { call->conn = candidate; + call->security = candidate->security; call->security_ix = candidate->security_ix; call->service_id = candidate->service_id; _leave(" = 0 [exclusive %d]", candidate->debug_id); @@ -404,6 +405,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx, candidate_published: set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); call->conn = candidate; + call->security = candidate->security; call->security_ix = candidate->security_ix; call->service_id = candidate->service_id; spin_unlock(&local->client_conns_lock); @@ -426,6 +428,7 @@ found_extant_conn: spin_lock(&conn->channel_lock); call->conn = conn; + call->security = conn->security; call->security_ix = conn->security_ix; call->service_id = conn->service_id; list_add_tail(&call->chan_wait_link, &conn->waiting_calls); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 3b0becb12041..a4090797c9b2 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -251,8 +251,8 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, seq += subpacket; } - return call->conn->security->verify_packet(call, skb, offset, len, - seq, cksum); + return call->security->verify_packet(call, skb, offset, len, + seq, cksum); } /* @@ -291,7 +291,7 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, *_offset = offset; *_len = len; - call->conn->security->locate_data(call, skb, _offset, _len); + call->security->locate_data(call, skb, _offset, _len); return 0; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 22f51a7e356e..813fd6888142 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -419,7 +419,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_winsize) sp->hdr.flags |= RXRPC_MORE_PACKETS; - ret = conn->security->secure_packet( + ret = call->security->secure_packet( call, skb, skb->mark, skb->head); if (ret < 0) goto out; -- cgit v1.2.3 From 1399c59fa92984836db90538cf92397fe7caaa57 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Fri, 4 Oct 2019 14:42:19 -0500 Subject: nl80211: fix memory leak in nl80211_get_ftm_responder_stats In nl80211_get_ftm_responder_stats, a new skb is created via nlmsg_new named msg. If nl80211hdr_put() fails, then msg should be released. The return statement should be replace by goto to error handling code. Fixes: 81e54d08d9d8 ("cfg80211: support FTM responder configuration/statistics") Signed-off-by: Navid Emamdoost Link: https://lore.kernel.org/r/20191004194220.19412-1-navid.emamdoost@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 141cdb171665..4453dd375de9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13682,7 +13682,7 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, NL80211_CMD_GET_FTM_RESPONDER_STATS); if (!hdr) - return -ENOBUFS; + goto nla_put_failure; if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; -- cgit v1.2.3 From 461c4c2b4c0731d7452bad4e77c0cdbdcea1804c Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 4 Oct 2019 15:37:06 +0300 Subject: cfg80211: fix a bunch of RCU issues in multi-bssid code cfg80211_update_notlisted_nontrans() leaves the RCU critical session too early, while still using nontrans_ssid which is RCU protected. In addition, it performs a bunch of RCU pointer update operations such as rcu_access_pointer and rcu_assign_pointer. The caller, cfg80211_inform_bss_frame_data(), also accesses the RCU pointer without holding the lock. Just wrap all of this with bss_lock. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20191004123706.15768-3-luca@coelho.fi Signed-off-by: Johannes Berg --- net/wireless/scan.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ff1016607f0b..aef240fdf8df 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1703,8 +1703,7 @@ cfg80211_parse_mbssid_frame_data(struct wiphy *wiphy, static void cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, struct cfg80211_bss *nontrans_bss, - struct ieee80211_mgmt *mgmt, size_t len, - gfp_t gfp) + struct ieee80211_mgmt *mgmt, size_t len) { u8 *ie, *new_ie, *pos; const u8 *nontrans_ssid, *trans_ssid, *mbssid; @@ -1715,6 +1714,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, const struct cfg80211_bss_ies *old; u8 cpy_len; + lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); + ie = mgmt->u.probe_resp.variable; new_ie_len = ielen; @@ -1731,23 +1732,22 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, if (!mbssid || mbssid < trans_ssid) return; new_ie_len -= mbssid[1]; - rcu_read_lock(); + nontrans_ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); - if (!nontrans_ssid) { - rcu_read_unlock(); + if (!nontrans_ssid) return; - } + new_ie_len += nontrans_ssid[1]; - rcu_read_unlock(); /* generate new ie for nontrans BSS * 1. replace SSID with nontrans BSS' SSID * 2. skip MBSSID IE */ - new_ie = kzalloc(new_ie_len, gfp); + new_ie = kzalloc(new_ie_len, GFP_ATOMIC); if (!new_ie) return; - new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, gfp); + + new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, GFP_ATOMIC); if (!new_ies) goto out_free; @@ -1901,6 +1901,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, cfg80211_parse_mbssid_frame_data(wiphy, data, mgmt, len, &non_tx_data, gfp); + spin_lock_bh(&wiphy_to_rdev(wiphy)->bss_lock); + /* check if the res has other nontransmitting bss which is not * in MBSSID IE */ @@ -1915,8 +1917,9 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, ies2 = rcu_access_pointer(tmp_bss->ies); if (ies2->tsf < ies1->tsf) cfg80211_update_notlisted_nontrans(wiphy, tmp_bss, - mgmt, len, gfp); + mgmt, len); } + spin_unlock_bh(&wiphy_to_rdev(wiphy)->bss_lock); return res; } -- cgit v1.2.3 From 95697f9907bfe3eab0ef20265a766b22e27dde64 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 4 Oct 2019 15:37:05 +0300 Subject: mac80211: accept deauth frames in IBSS mode We can process deauth frames and all, but we drop them very early in the RX path today - this could never have worked. Fixes: 2cc59e784b54 ("mac80211: reply to AUTH with DEAUTH if sta allocation fails in IBSS") Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20191004123706.15768-2-luca@coelho.fi Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 768d14c9a716..0e05ff037672 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3467,9 +3467,18 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP): /* process for all: mesh, mlme, ibss */ break; + case cpu_to_le16(IEEE80211_STYPE_DEAUTH): + if (is_multicast_ether_addr(mgmt->da) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + + /* process only for station/IBSS */ + if (sdata->vif.type != NL80211_IFTYPE_STATION && + sdata->vif.type != NL80211_IFTYPE_ADHOC) + return RX_DROP_MONITOR; + break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): - case cpu_to_le16(IEEE80211_STYPE_DEAUTH): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) -- cgit v1.2.3 From dc0c18ed229cdcca283dd78fefa38273ec37a42c Mon Sep 17 00:00:00 2001 From: Aaron Komisar Date: Wed, 2 Oct 2019 13:59:07 +0000 Subject: mac80211: fix scan when operating on DFS channels in ETSI domains In non-ETSI regulatory domains scan is blocked when operating channel is a DFS channel. For ETSI, however, once DFS channel is marked as available after the CAC, this channel will remain available (for some time) even after leaving this channel. Therefore a scan can be done without any impact on the availability of the DFS channel as no new CAC is required after the scan. Enable scan in mac80211 in these cases. Signed-off-by: Aaron Komisar Link: https://lore.kernel.org/r/1570024728-17284-1-git-send-email-aaron.komisar@tandemg.com Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 30 ++++++++++++++++++++++++++++-- net/wireless/reg.c | 1 + net/wireless/reg.h | 8 -------- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index adf94ba1ed77..4d31d9688dc2 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -520,10 +520,33 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, return 0; } +static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *sdata_iter; + + if (!ieee80211_is_radar_required(local)) + return true; + + if (!regulatory_pre_cac_allowed(local->hw.wiphy)) + return false; + + mutex_lock(&local->iflist_mtx); + list_for_each_entry(sdata_iter, &local->interfaces, list) { + if (sdata_iter->wdev.cac_started) { + mutex_unlock(&local->iflist_mtx); + return false; + } + } + mutex_unlock(&local->iflist_mtx); + + return true; +} + static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { - if (ieee80211_is_radar_required(local)) + if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) @@ -630,7 +653,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, lockdep_assert_held(&local->mtx); - if (local->scan_req || ieee80211_is_radar_required(local)) + if (local->scan_req) + return -EBUSY; + + if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 420c4207ab59..446c76d44e65 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3883,6 +3883,7 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy) return pre_cac_allowed; } +EXPORT_SYMBOL(regulatory_pre_cac_allowed); void regulatory_propagate_dfs_state(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 504133d76de4..dc8f689bd469 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -155,14 +155,6 @@ bool regulatory_indoor_allowed(void); */ #define REG_PRE_CAC_EXPIRY_GRACE_MS 2000 -/** - * regulatory_pre_cac_allowed - if pre-CAC allowed in the current dfs domain - * @wiphy: wiphy for which pre-CAC capability is checked. - - * Pre-CAC is allowed only in ETSI domain. - */ -bool regulatory_pre_cac_allowed(struct wiphy *wiphy); - /** * regulatory_propagate_dfs_state - Propagate DFS channel state to other wiphys * @wiphy - wiphy on which radar is detected and the event will be propagated -- cgit v1.2.3 From c6ee11c39fcc1fb55130748990a8f199e76263b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Oct 2019 14:24:24 -0700 Subject: llc: fix sk_buff leak in llc_sap_state_process() syzbot reported: BUG: memory leak unreferenced object 0xffff888116270800 (size 224): comm "syz-executor641", pid 7047, jiffies 4294947360 (age 13.860s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 20 e1 2a 81 88 ff ff 00 40 3d 2a 81 88 ff ff . .*.....@=*.... backtrace: [<000000004d41b4cc>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<000000004d41b4cc>] slab_post_alloc_hook mm/slab.h:439 [inline] [<000000004d41b4cc>] slab_alloc_node mm/slab.c:3269 [inline] [<000000004d41b4cc>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579 [<00000000506a5965>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198 [<000000001ba5a161>] alloc_skb include/linux/skbuff.h:1058 [inline] [<000000001ba5a161>] alloc_skb_with_frags+0x5f/0x250 net/core/skbuff.c:5327 [<0000000047d9c78b>] sock_alloc_send_pskb+0x269/0x2a0 net/core/sock.c:2225 [<000000003828fe54>] sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2242 [<00000000e34d94f9>] llc_ui_sendmsg+0x10a/0x540 net/llc/af_llc.c:933 [<00000000de2de3fb>] sock_sendmsg_nosec net/socket.c:652 [inline] [<00000000de2de3fb>] sock_sendmsg+0x54/0x70 net/socket.c:671 [<000000008fe16e7a>] __sys_sendto+0x148/0x1f0 net/socket.c:1964 [...] The bug is that llc_sap_state_process() always takes an extra reference to the skb, but sometimes neither llc_sap_next_state() nor llc_sap_state_process() itself drops this reference. Fix it by changing llc_sap_next_state() to never consume a reference to the skb, rather than sometimes do so and sometimes not. Then remove the extra skb_get() and kfree_skb() from llc_sap_state_process(). Reported-by: syzbot+6bf095f9becf5efef645@syzkaller.appspotmail.com Reported-by: syzbot+31c16aa4202dace3812e@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Biggers Signed-off-by: Jakub Kicinski --- net/llc/llc_s_ac.c | 12 +++++++++--- net/llc/llc_sap.c | 23 ++++++++--------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index a94bd56bcac6..7ae4cc684d3a 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -58,8 +58,10 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } @@ -81,8 +83,10 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } @@ -135,8 +139,10 @@ int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_test_cmd(skb); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) + if (likely(!rc)) { + skb_get(skb); rc = dev_queue_xmit(skb); + } return rc; } diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index a7f7b8ff4729..be419062e19a 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -197,29 +197,22 @@ out: * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. + * + * This function always consumes a reference to the skb. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - /* - * We have to hold the skb, because llc_sap_next_state - * will kfree it in the sending path and we need to - * look at the skb->cb, where we encode llc_sap_state_ev. - */ - skb_get(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); - if (ev->ind_cfm_flag == LLC_IND) { - if (skb->sk->sk_state == TCP_LISTEN) - kfree_skb(skb); - else { - llc_save_primitive(skb->sk, skb, ev->prim); - /* queue skb to the user. */ - if (sock_queue_rcv_skb(skb->sk, skb)) - kfree_skb(skb); - } + if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) { + llc_save_primitive(skb->sk, skb, ev->prim); + + /* queue skb to the user. */ + if (sock_queue_rcv_skb(skb->sk, skb) == 0) + return; } kfree_skb(skb); } -- cgit v1.2.3 From b74555de21acd791f12c4a1aeaf653dd7ac21133 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Oct 2019 14:24:25 -0700 Subject: llc: fix sk_buff leak in llc_conn_service() syzbot reported: BUG: memory leak unreferenced object 0xffff88811eb3de00 (size 224): comm "syz-executor559", pid 7315, jiffies 4294943019 (age 10.300s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 a0 38 24 81 88 ff ff 00 c0 f2 15 81 88 ff ff ..8$............ backtrace: [<000000008d1c66a1>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<000000008d1c66a1>] slab_post_alloc_hook mm/slab.h:439 [inline] [<000000008d1c66a1>] slab_alloc_node mm/slab.c:3269 [inline] [<000000008d1c66a1>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579 [<00000000447d9496>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198 [<000000000cdbf82f>] alloc_skb include/linux/skbuff.h:1058 [inline] [<000000000cdbf82f>] llc_alloc_frame+0x66/0x110 net/llc/llc_sap.c:54 [<000000002418b52e>] llc_conn_ac_send_sabme_cmd_p_set_x+0x2f/0x140 net/llc/llc_c_ac.c:777 [<000000001372ae17>] llc_exec_conn_trans_actions net/llc/llc_conn.c:475 [inline] [<000000001372ae17>] llc_conn_service net/llc/llc_conn.c:400 [inline] [<000000001372ae17>] llc_conn_state_process+0x1ac/0x640 net/llc/llc_conn.c:75 [<00000000f27e53c1>] llc_establish_connection+0x110/0x170 net/llc/llc_if.c:109 [<00000000291b2ca0>] llc_ui_connect+0x10e/0x370 net/llc/af_llc.c:477 [<000000000f9c740b>] __sys_connect+0x11d/0x170 net/socket.c:1840 [...] The bug is that most callers of llc_conn_send_pdu() assume it consumes a reference to the skb, when actually due to commit b85ab56c3f81 ("llc: properly handle dev_queue_xmit() return value") it doesn't. Revert most of that commit, and instead make the few places that need llc_conn_send_pdu() to *not* consume a reference call skb_get() before. Fixes: b85ab56c3f81 ("llc: properly handle dev_queue_xmit() return value") Reported-by: syzbot+6b825a6494a04cc0e3f7@syzkaller.appspotmail.com Signed-off-by: Eric Biggers Signed-off-by: Jakub Kicinski --- net/llc/llc_c_ac.c | 8 ++++++-- net/llc/llc_conn.c | 32 +++++++++----------------------- 2 files changed, 15 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 4d78375f9872..647c0554d04c 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -372,6 +372,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -389,7 +390,8 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; @@ -406,6 +408,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { + skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } @@ -916,7 +919,8 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { - rc = llc_conn_send_pdu(sk, skb); + skb_get(skb); + llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 4ff89cb7c86f..ed2aca12460c 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -30,7 +30,7 @@ #endif static int llc_find_offset(int state, int ev_type); -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb); +static void llc_conn_send_pdus(struct sock *sk); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, struct llc_conn_state_trans *trans, @@ -193,11 +193,11 @@ out_skb_put: return rc; } -int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); - return llc_conn_send_pdus(sk, skb); + llc_conn_send_pdus(sk); } /** @@ -255,7 +255,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -296,7 +296,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ - llc_conn_send_pdus(sk, NULL); + llc_conn_send_pdus(sk); out:; } @@ -340,16 +340,12 @@ out: /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection - * @hold_skb: the skb held by caller, or NULL if does not care * - * Sends queued pdus to MAC layer for transmission. When @hold_skb is - * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent - * successfully, or 1 for failure. + * Sends queued pdus to MAC layer for transmission. */ -static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) +static void llc_conn_send_pdus(struct sock *sk) { struct sk_buff *skb; - int ret = 0; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); @@ -361,20 +357,10 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb) skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; - dev_queue_xmit(skb2); - } else { - bool is_target = skb == hold_skb; - int rc; - - if (is_target) - skb_get(skb); - rc = dev_queue_xmit(skb); - if (is_target) - ret = rc; + skb = skb2; } + dev_queue_xmit(skb); } - - return ret; } /** -- cgit v1.2.3 From fc8d5db10cbe1338a52ebc74e7feab9276721774 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Oct 2019 14:24:26 -0700 Subject: llc: fix another potential sk_buff leak in llc_ui_sendmsg() All callers of llc_conn_state_process() except llc_build_and_send_pkt() (via llc_ui_sendmsg() -> llc_ui_send_data()) assume that it always consumes a reference to the skb. Fix this caller to do the same. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Biggers Signed-off-by: Jakub Kicinski --- net/llc/af_llc.c | 34 ++++++++++++++++++++-------------- net/llc/llc_conn.c | 2 ++ net/llc/llc_if.c | 12 ++++++++---- 3 files changed, 30 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 2017b7d780f5..c74f44dfaa22 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -113,22 +113,26 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. + * + * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); - int rc = 0; if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); + int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); + if (rc) { + kfree_skb(skb); + return rc; + } } - if (unlikely(!rc)) - rc = llc_build_and_send_pkt(sk, skb); - return rc; + return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) @@ -899,7 +903,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; + struct sk_buff *skb = NULL; size_t size = 0; int rc = -EINVAL, copied = 0, hdrlen; @@ -908,10 +912,10 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) - goto release; + goto out; } else { if (llc_ui_addr_null(&llc->addr)) - goto release; + goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ @@ -919,7 +923,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) - goto release; + goto out; } hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); size = hdrlen + len; @@ -928,12 +932,12 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) copied = size - hdrlen; rc = -EINVAL; if (copied < 0) - goto release; + goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) - goto release; + goto out; skb->dev = llc->dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hdrlen); @@ -943,29 +947,31 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); + skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); + skb = NULL; out: - if (rc) { - kfree_skb(skb); -release: + kfree_skb(skb); + if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); - } release_sock(sk); return rc ? : copied; } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index ed2aca12460c..0b0c6f12153b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -55,6 +55,8 @@ int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; * (executing it's actions and changing state), upper layer will be * indicated or confirmed, if needed. Returns 0 for success, 1 for * failure. The socket lock has to be held before calling this function. + * + * This function always consumes a reference to the skb. */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) { diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 8db03c2d5440..ad6547736c21 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -38,6 +38,8 @@ * closed and -EBUSY when sending data is not permitted in this state or * LLC has send an I pdu with p bit set to 1 and is waiting for it's * response. + * + * This function always consumes a reference to the skb. */ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) { @@ -46,20 +48,22 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); if (unlikely(llc->state == LLC_CONN_STATE_ADM)) - goto out; + goto out_free; rc = -EBUSY; if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */ llc->p_flag)) { llc->failed_data_req = 1; - goto out; + goto out_free; } ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DATA_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb->dev = llc->dev; - rc = llc_conn_state_process(sk, skb); -out: + return llc_conn_state_process(sk, skb); + +out_free: + kfree_skb(skb); return rc; } -- cgit v1.2.3 From 36453c852816f19947ca482a595dffdd2efa4965 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Oct 2019 14:24:27 -0700 Subject: llc: fix sk_buff refcounting in llc_conn_state_process() If llc_conn_state_process() sees that llc_conn_service() put the skb on a list, it will drop one fewer references to it. This is wrong because the current behavior is that llc_conn_service() never consumes a reference to the skb. The code also makes the number of skb references being dropped conditional on which of ind_prim and cfm_prim are nonzero, yet neither of these affects how many references are *acquired*. So there is extra code that tries to fix this up by sometimes taking another reference. Remove the unnecessary/broken refcounting logic and instead just add an skb_get() before the only two places where an extra reference is actually consumed. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Biggers Signed-off-by: Jakub Kicinski --- net/llc/llc_conn.c | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 0b0c6f12153b..a79b739eb223 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -64,12 +64,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(skb->sk); struct llc_conn_state_ev *ev = llc_conn_ev(skb); - /* - * We have to hold the skb, because llc_conn_service will kfree it in - * the sending path and we need to look at the skb->cb, where we encode - * llc_conn_state_ev. - */ - skb_get(skb); ev->ind_prim = ev->cfm_prim = 0; /* * Send event to state machine @@ -77,21 +71,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) rc = llc_conn_service(skb->sk, skb); if (unlikely(rc != 0)) { printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); - goto out_kfree_skb; - } - - if (unlikely(!ev->ind_prim && !ev->cfm_prim)) { - /* indicate or confirm not required */ - if (!skb->next) - goto out_kfree_skb; goto out_skb_put; } - if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */ - skb_get(skb); - switch (ev->ind_prim) { case LLC_DATA_PRIM: + skb_get(skb); llc_save_primitive(sk, skb, LLC_DATA_PRIM); if (unlikely(sock_queue_rcv_skb(sk, skb))) { /* @@ -108,6 +93,7 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * skb->sk pointing to the newly created struct sock in * llc_conn_handler. -acme */ + skb_get(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); break; @@ -123,7 +109,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) sk->sk_state_change(sk); } } - kfree_skb(skb); sock_put(sk); break; case LLC_RESET_PRIM: @@ -132,14 +117,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset ind!\n", __func__); - kfree_skb(skb); break; default: - if (ev->ind_prim) { + if (ev->ind_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->ind_prim); - kfree_skb(skb); - } /* No indication */ break; } @@ -181,15 +163,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) printk(KERN_INFO "%s: received a reset conf!\n", __func__); break; default: - if (ev->cfm_prim) { + if (ev->cfm_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->cfm_prim); - break; - } - goto out_skb_put; /* No confirmation */ + /* No confirmation */ + break; } -out_kfree_skb: - kfree_skb(skb); out_skb_put: kfree_skb(skb); return rc; -- cgit v1.2.3 From 6f96c3c6904c26cea9ca2726d5d8a9b0b8205b3c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 7 Oct 2019 13:26:28 -0700 Subject: net_sched: fix backward compatibility for TCA_KIND Marcelo noticed a backward compatibility issue of TCA_KIND after we move from NLA_STRING to NLA_NUL_STRING, so it is probably too late to change it. Instead, to make everyone happy, we can just insert a NUL to terminate the string with nla_strlcpy() like we do for TC actions. Fixes: 62794fc4fbf5 ("net_sched: add max len check for TCA_KIND") Reported-by: Marcelo Ricardo Leitner Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 36 +++++++++++++++++++++++++++++++++--- net/sched/sch_api.c | 3 +-- 2 files changed, 34 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 64584a1df425..8717c0b26c90 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -162,11 +162,22 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) return TC_H_MAJ(first); } +static bool tcf_proto_check_kind(struct nlattr *kind, char *name) +{ + if (kind) + return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ; + memset(name, 0, IFNAMSIZ); + return false; +} + static bool tcf_proto_is_unlocked(const char *kind) { const struct tcf_proto_ops *ops; bool ret; + if (strlen(kind) == 0) + return false; + ops = tcf_proto_lookup_ops(kind, false, NULL); /* On error return false to take rtnl lock. Proto lookup/create * functions will perform lookup again and properly handle errors. @@ -1843,6 +1854,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -1899,13 +1911,19 @@ replay: if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } + /* Take rtnl mutex if rtnl_held was set to true on previous iteration, * block is shared (no qdisc found), qdisc is not unlocked, classifier * type is not specified, classifier is not unlocked. */ if (rtnl_held || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2063,6 +2081,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2102,13 +2121,18 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc * found), qdisc is not unlocked, classifier type is not specified, * classifier is not unlocked. */ if (!prio || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } @@ -2216,6 +2240,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; + char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; @@ -2252,12 +2277,17 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (err) return err; + if (tcf_proto_check_kind(tca[TCA_KIND], name)) { + NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); + err = -EINVAL; + goto errout; + } /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not * unlocked, classifier type is not specified, classifier is not * unlocked. */ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || - !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) { + !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 81d58b280612..1047825d9f48 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1390,8 +1390,7 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { - [TCA_KIND] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ - 1 }, + [TCA_KIND] = { .type = NLA_STRING }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, -- cgit v1.2.3 From 4b793feccae3b06764268377a4030eb774ed924e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 7 Oct 2019 13:26:29 -0700 Subject: net_sched: fix backward compatibility for TCA_ACT_KIND For TCA_ACT_KIND, we have to keep the backward compatibility too, and rely on nla_strlcpy() to check and terminate the string with a NUL. Note for TC actions, nla_strcmp() is already used to compare kind strings, so we don't need to fix other places. Fixes: 199ce850ce11 ("net_sched: add policy validation for action attributes") Reported-by: Marcelo Ricardo Leitner Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2558f00f6b3e..4e7429c6f864 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -832,8 +832,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) } static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { - [TCA_ACT_KIND] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ - 1 }, + [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, @@ -865,8 +864,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } - nla_strlcpy(act_name, kind, IFNAMSIZ); - + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "TC action name too long"); + goto err_out; + } if (tb[TCA_ACT_COOKIE]) { cookie = nla_memdup_cookie(tb); if (!cookie) { -- cgit v1.2.3 From 4123f637a5129470ff9d3cb00a5a4e213f2e15cc Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 8 Oct 2019 17:56:03 +0800 Subject: ip6erspan: remove the incorrect mtu limit for ip6erspan ip6erspan driver calls ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. It causes the dev mtu of the erspan device to not be greater than 1500, this limit value is not correct for ip6erspan tap device. Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") Signed-off-by: Haishuang Yan Acked-by: William Tu Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d5779d6a6065..787d9f2a6e99 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2192,6 +2192,7 @@ static void ip6erspan_tap_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = 0; dev->netdev_ops = &ip6erspan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; -- cgit v1.2.3 From 819be8108fded0b9e710bbbf81193e52f7bab2f7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:09:23 +0800 Subject: sctp: add chunks to sk_backlog when the newsk sk_socket is not set This patch is to fix a NULL-ptr deref in selinux_socket_connect_helper: [...] kasan: GPF could be caused by NULL-ptr deref or user memory access [...] RIP: 0010:selinux_socket_connect_helper+0x94/0x460 [...] Call Trace: [...] selinux_sctp_bind_connect+0x16a/0x1d0 [...] security_sctp_bind_connect+0x58/0x90 [...] sctp_process_asconf+0xa52/0xfd0 [sctp] [...] sctp_sf_do_asconf+0x785/0x980 [sctp] [...] sctp_do_sm+0x175/0x5a0 [sctp] [...] sctp_assoc_bh_rcv+0x285/0x5b0 [sctp] [...] sctp_backlog_rcv+0x482/0x910 [sctp] [...] __release_sock+0x11e/0x310 [...] release_sock+0x4f/0x180 [...] sctp_accept+0x3f9/0x5a0 [sctp] [...] inet_accept+0xe7/0x720 It was caused by that the 'newsk' sk_socket was not set before going to security sctp hook when processing asconf chunk with SCTP_PARAM_ADD_IP or SCTP_PARAM_SET_PRIMARY: inet_accept()-> sctp_accept(): lock_sock(): lock listening 'sk' do_softirq(): sctp_rcv(): <-- [1] asconf chunk arrives and enqueued in 'sk' backlog sctp_sock_migrate(): set asoc's sk to 'newsk' release_sock(): sctp_backlog_rcv(): lock 'newsk' sctp_process_asconf() <-- [2] unlock 'newsk' sock_graft(): set sk_socket <-- [3] As it shows, at [1] the asconf chunk would be put into the listening 'sk' backlog, as accept() was holding its sock lock. Then at [2] asconf would get processed with 'newsk' as asoc's sk had been set to 'newsk'. However, 'newsk' sk_socket is not set until [3], while selinux_sctp_bind_connect() would deref it, then kernel crashed. Here to fix it by adding the chunk to sk_backlog until newsk sk_socket is set when .accept() is done. Note that sk->sk_socket can be NULL when the sock is closed, so SOCK_DEAD flag is also needed to check in sctp_newsk_ready(). Thanks to Ondrej for reviewing the code. Fixes: d452930fd3b9 ("selinux: Add SCTP support") Reported-by: Ying Xu Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- net/sctp/input.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 5a070fb5b278..f2771375bfc0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -243,7 +243,7 @@ int sctp_rcv(struct sk_buff *skb) bh_lock_sock(sk); } - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); @@ -321,7 +321,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) local_bh_disable(); bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { + if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) sctp_chunk_free(chunk); else @@ -336,7 +336,13 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) if (backloged) return 0; } else { - sctp_inq_push(inqueue, chunk); + if (!sctp_newsk_ready(sk)) { + if (!sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + return 0; + sctp_chunk_free(chunk); + } else { + sctp_inq_push(inqueue, chunk); + } } done: -- cgit v1.2.3 From a954380acde6adf584de76c41e1d520097820dd5 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 8 Oct 2019 16:20:07 -0700 Subject: net: taprio: Fix returning EINVAL when configuring without flags When configuring a taprio instance if "flags" is not specified (or it's zero), taprio currently replies with an "Invalid argument" error. So, set the return value to zero after we are done with all the checks. Fixes: 9c66d1564676 ("taprio: Add support for hardware offloading") Signed-off-by: Vinicius Costa Gomes Acked-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 68b543f85a96..6719a65169d4 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1341,6 +1341,10 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } + + /* Everything went ok, return success. */ + err = 0; + out: return err; } -- cgit v1.2.3 From 11c9a7d38af524217efb7a176ad322b97ac2f163 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 9 Oct 2019 11:10:52 +0800 Subject: act_mirred: Fix mirred_init_module error handling If tcf_register_action failed, mirred_device_notifier should be unregistered. Fixes: 3b87956ea645 ("net sched: fix race in mirred device removal") Signed-off-by: YueHaibing Signed-off-by: Jakub Kicinski --- net/sched/act_mirred.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 9ce073a05414..08923b21e566 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -484,7 +484,11 @@ static int __init mirred_init_module(void) return err; pr_info("Mirror/redirect action on\n"); - return tcf_register_action(&act_mirred_ops, &mirred_net_ops); + err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); + if (err) + unregister_netdevice_notifier(&mirred_device_notifier); + + return err; } static void __exit mirred_cleanup_module(void) -- cgit v1.2.3 From 993e4c929a073595d22c85f59082f0c387e31c21 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 9 Oct 2019 11:19:10 +0200 Subject: netns: fix NLM_F_ECHO mechanism for RTM_NEWNSID The flag NLM_F_ECHO aims to reply to the user the message notified to all listeners. It was not the case with the command RTM_NEWNSID, let's fix this. Fixes: 0c7aecd4bde4 ("netns: add rtnl cmd to add and get peer netns ids") Reported-by: Guillaume Nault Signed-off-by: Nicolas Dichtel Acked-by: Guillaume Nault Tested-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a0e0d298c991..6d3e4821b02d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -245,7 +245,8 @@ static int __peernet2id(struct net *net, struct net *peer) return __peernet2id_alloc(net, peer, &no); } -static void rtnl_net_notifyid(struct net *net, int cmd, int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ @@ -268,7 +269,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL); if (alive) put_net(peer); return id; @@ -532,7 +533,7 @@ static void unhash_nsid(struct net *net, struct net *last) idr_remove(&tmp->netns_ids, id); spin_unlock_bh(&tmp->nsid_lock); if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL); if (tmp == last) break; } @@ -764,7 +765,8 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, err = alloc_netid(net, peer, nsid); spin_unlock_bh(&net->nsid_lock); if (err >= 0) { - rtnl_net_notifyid(net, RTM_NEWNSID, err); + rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, + nlh); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; @@ -1051,9 +1053,12 @@ end: return err < 0 ? err : skb->len; } -static void rtnl_net_notifyid(struct net *net, int cmd, int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh) { struct net_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; @@ -1068,7 +1073,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (err < 0) goto err_out; - rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0); + rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, 0); return; err_out: -- cgit v1.2.3 From e37542ba111f3974dc622ae0a21c1787318de500 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 09:19:13 -0700 Subject: netfilter: conntrack: avoid possible false sharing As hinted by KCSAN, we need at least one READ_ONCE() to prevent a compiler optimization. More details on : https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE#it-may-improve-performance sysbot report : BUG: KCSAN: data-race in __nf_ct_refresh_acct / __nf_ct_refresh_acct read to 0xffff888123eb4f08 of 4 bytes by interrupt on cpu 0: __nf_ct_refresh_acct+0xd4/0x1b0 net/netfilter/nf_conntrack_core.c:1796 nf_ct_refresh_acct include/net/netfilter/nf_conntrack.h:201 [inline] nf_conntrack_tcp_packet+0xd40/0x3390 net/netfilter/nf_conntrack_proto_tcp.c:1161 nf_conntrack_handle_packet net/netfilter/nf_conntrack_core.c:1633 [inline] nf_conntrack_in+0x410/0xaa0 net/netfilter/nf_conntrack_core.c:1727 ipv4_conntrack_in+0x27/0x40 net/netfilter/nf_conntrack_proto.c:178 nf_hook_entry_hookfn include/linux/netfilter.h:135 [inline] nf_hook_slow+0x83/0x160 net/netfilter/core.c:512 nf_hook include/linux/netfilter.h:260 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip_rcv+0x12f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 __do_softirq+0x115/0x33f kernel/softirq.c:292 write to 0xffff888123eb4f08 of 4 bytes by task 7191 on cpu 1: __nf_ct_refresh_acct+0xfb/0x1b0 net/netfilter/nf_conntrack_core.c:1797 nf_ct_refresh_acct include/net/netfilter/nf_conntrack.h:201 [inline] nf_conntrack_tcp_packet+0xd40/0x3390 net/netfilter/nf_conntrack_proto_tcp.c:1161 nf_conntrack_handle_packet net/netfilter/nf_conntrack_core.c:1633 [inline] nf_conntrack_in+0x410/0xaa0 net/netfilter/nf_conntrack_core.c:1727 ipv4_conntrack_local+0xbe/0x130 net/netfilter/nf_conntrack_proto.c:200 nf_hook_entry_hookfn include/linux/netfilter.h:135 [inline] nf_hook_slow+0x83/0x160 net/netfilter/core.c:512 nf_hook include/linux/netfilter.h:260 [inline] __ip_local_out+0x1f7/0x2b0 net/ipv4/ip_output.c:114 ip_local_out+0x31/0x90 net/ipv4/ip_output.c:123 __ip_queue_xmit+0x3a8/0xa40 net/ipv4/ip_output.c:532 ip_queue_xmit+0x45/0x60 include/net/ip.h:236 __tcp_transmit_skb+0xdeb/0x1cd0 net/ipv4/tcp_output.c:1158 __tcp_send_ack+0x246/0x300 net/ipv4/tcp_output.c:3685 tcp_send_ack+0x34/0x40 net/ipv4/tcp_output.c:3691 tcp_cleanup_rbuf+0x130/0x360 net/ipv4/tcp.c:1575 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 7191 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: cc16921351d8 ("netfilter: conntrack: avoid same-timeout update") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jozsef Kadlecsik Cc: Florian Westphal Acked-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- net/netfilter/nf_conntrack_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0c63120b2db2..5cd610b547e0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1792,8 +1792,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) extra_jiffies += nfct_time_stamp; - if (ct->timeout != extra_jiffies) - ct->timeout = extra_jiffies; + if (READ_ONCE(ct->timeout) != extra_jiffies) + WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); -- cgit v1.2.3 From 503978aca46124cd714703e180b9c8292ba50ba7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 12:55:53 -0700 Subject: net: avoid possible false sharing in sk_leave_memory_pressure() As mentioned in https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE#it-may-improve-performance a C compiler can legally transform : if (memory_pressure && *memory_pressure) *memory_pressure = 0; to : if (memory_pressure) *memory_pressure = 0; Fixes: 0604475119de ("tcp: add TCPMemoryPressuresChrono counter") Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.") Fixes: 3ab224be6d69 ("[NET] CORE: Introducing new memory accounting interface.") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index fac2b4d80de5..50647a10fdb7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2334,8 +2334,8 @@ static void sk_leave_memory_pressure(struct sock *sk) } else { unsigned long *memory_pressure = sk->sk_prot->memory_pressure; - if (memory_pressure && *memory_pressure) - *memory_pressure = 0; + if (memory_pressure && READ_ONCE(*memory_pressure)) + WRITE_ONCE(*memory_pressure, 0); } } -- cgit v1.2.3 From 60b173ca3d1cd1782bd0096dc17298ec242f6fb1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:51:20 -0700 Subject: net: add {READ|WRITE}_ONCE() annotations on ->rskq_accept_head reqsk_queue_empty() is called from inet_csk_listen_poll() while other cpus might write ->rskq_accept_head value. Use {READ|WRITE}_ONCE() to avoid compiler tricks and potential KCSAN splats. Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a9183543ca30..dbcf34ec8dd2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -934,7 +934,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk, req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) - queue->rskq_accept_head = req; + WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; -- cgit v1.2.3 From 1f142c17d19a5618d5a633195a46f2c8be9bf232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:10:15 -0700 Subject: tcp: annotate lockless access to tcp_memory_pressure tcp_memory_pressure is read without holding any lock, and its value could be changed on other cpus. Use READ_ONCE() to annotate these lockless reads. The write side is already using atomic ops. Fixes: b8da51ebb1aa ("tcp: introduce tcp_under_memory_pressure()") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f98a1882e537..888c92b63f5a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -326,7 +326,7 @@ void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; - if (tcp_memory_pressure) + if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; @@ -341,7 +341,7 @@ void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; - if (!tcp_memory_pressure) + if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) -- cgit v1.2.3 From 8265792bf8871acc2d00fd03883d830e2249d395 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:21:13 -0700 Subject: net: silence KCSAN warnings around sk_add_backlog() calls sk_add_backlog() callers usually read sk->sk_rcvbuf without owning the socket lock. This means sk_rcvbuf value can be changed by other cpus, and KCSAN complains. Add READ_ONCE() annotations to document the lockless nature of these reads. Note that writes over sk_rcvbuf should also use WRITE_ONCE(), but this will be done in separate patches to ease stable backports (if we decide this is relevant for stable trees). BUG: KCSAN: data-race in tcp_add_backlog / tcp_recvmsg write to 0xffff88812ab369f8 of 8 bytes by interrupt on cpu 1: __sk_add_backlog include/net/sock.h:902 [inline] sk_add_backlog include/net/sock.h:933 [inline] tcp_add_backlog+0x45a/0xcc0 net/ipv4/tcp_ipv4.c:1737 tcp_v4_rcv+0x1aba/0x1bf0 net/ipv4/tcp_ipv4.c:1925 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 read to 0xffff88812ab369f8 of 8 bytes by task 7271 on cpu 0: tcp_recvmsg+0x470/0x1a30 net/ipv4/tcp.c:2047 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 ksys_read+0xd5/0x1b0 fs/read_write.c:587 __do_sys_read fs/read_write.c:597 [inline] __se_sys_read fs/read_write.c:595 [inline] __x64_sys_read+0x4c/0x60 fs/read_write.c:595 do_syscall_64+0xcf/0x2f0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7271 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- net/core/sock.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/llc/llc_conn.c | 2 +- net/sctp/input.c | 6 +++--- net/tipc/socket.c | 6 +++--- net/x25/x25_dev.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 50647a10fdb7..1cf06934da50 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -522,7 +522,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bf124b1742df..492bf6a6b023 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1644,7 +1644,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index a79b739eb223..7b620acaca9e 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -813,7 +813,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) else { dprintk("%s: adding to backlog...\n", __func__); llc_set_backlog_type(skb, LLC_PACKET); - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) goto drop_unlock; } out: diff --git a/net/sctp/input.c b/net/sctp/input.c index f2771375bfc0..2277981559d0 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -322,7 +322,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { - if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; @@ -337,7 +337,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) return 0; } else { if (!sctp_newsk_ready(sk)) { - if (!sk_add_backlog(sk, skb, sk->sk_rcvbuf)) + if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { @@ -364,7 +364,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) struct sctp_ep_common *rcvr = chunk->rcvr; int ret; - ret = sk_add_backlog(sk, skb, sk->sk_rcvbuf); + ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3b9f8cc328f5..7c736cfec57f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2119,13 +2119,13 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_msg *hdr = buf_msg(skb); if (unlikely(msg_in_group(hdr))) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); if (unlikely(!msg_connected(hdr))) - return sk->sk_rcvbuf << msg_importance(hdr); + return READ_ONCE(sk->sk_rcvbuf) << msg_importance(hdr); if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) - return sk->sk_rcvbuf; + return READ_ONCE(sk->sk_rcvbuf); return FLOWCTL_MSG_LIM; } diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 5c111bc3c8ea..00e782335cb0 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -55,7 +55,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) if (!sock_owned_by_user(sk)) { queued = x25_process_rx_frame(sk, skb); } else { - queued = !sk_add_backlog(sk, skb, sk->sk_rcvbuf); + queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); } bh_unlock_sock(sk); sock_put(sk); -- cgit v1.2.3 From eac66402d1c342f07ff38f8d631ff95eb7ad3220 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:32:35 -0700 Subject: net: annotate sk->sk_rcvlowat lockless reads sock_rcvlowat() or int_sk_rcvlowat() might be called without the socket lock for example from tcp_poll(). Use READ_ONCE() to document the fact that other cpus might change sk->sk_rcvlowat under us and avoid KCSAN splats. Use WRITE_ONCE() on write sides too. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/filter.c | 2 +- net/core/sock.c | 2 +- net/ipv4/tcp.c | 2 +- net/sched/em_meta.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..a50c0b6846f2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4274,7 +4274,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_MARK: if (sk->sk_mark != val) { diff --git a/net/core/sock.c b/net/core/sock.c index 1cf06934da50..b7c5c6ea51ba 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -974,7 +974,7 @@ set_rcvbuf: if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; case SO_RCVTIMEO_OLD: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 888c92b63f5a..8781a92ea4b6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1699,7 +1699,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) else cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; val = min(val, cap); - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 82bd14e7ac93..4c9122fc35c9 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat) *err = -1; return; } - dst->value = sk->sk_rcvlowat; + dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) -- cgit v1.2.3 From 70c2655849a25431f31b505a07fe0c861e5e41fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:41:03 -0700 Subject: net: silence KCSAN warnings about sk->sk_backlog.len reads sk->sk_backlog.len can be written by BH handlers, and read from process contexts in a lockless way. Note the write side should also use WRITE_ONCE() or a variant. We need some agreement about the best way to do this. syzbot reported : BUG: KCSAN: data-race in tcp_add_backlog / tcp_grow_window.isra.0 write to 0xffff88812665f32c of 4 bytes by interrupt on cpu 1: sk_add_backlog include/net/sock.h:934 [inline] tcp_add_backlog+0x4a0/0xcc0 net/ipv4/tcp_ipv4.c:1737 tcp_v4_rcv+0x1aba/0x1bf0 net/ipv4/tcp_ipv4.c:1925 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 read to 0xffff88812665f32c of 4 bytes by task 7292 on cpu 0: tcp_space include/net/tcp.h:1373 [inline] tcp_grow_window.isra.0+0x6b/0x480 net/ipv4/tcp_input.c:413 tcp_event_data_recv+0x68f/0x990 net/ipv4/tcp_input.c:717 tcp_rcv_established+0xbfe/0xf50 net/ipv4/tcp_input.c:5618 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2427 release_sock+0x61/0x160 net/core/sock.c:2943 tcp_recvmsg+0x63b/0x1a30 net/ipv4/tcp.c:2181 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7292 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- net/core/sock.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index b7c5c6ea51ba..2a053999df11 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3210,7 +3210,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index fc9a4c6629ce..0851166b9175 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -175,7 +175,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7c736cfec57f..f8bbc4aab213 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3790,7 +3790,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf); i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk)); i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf); - i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len); + i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len)); if (dqueues & TIPC_DUMP_SK_SNDQ) { i += scnprintf(buf + i, sz - i, "sk_write_queue: "); -- cgit v1.2.3 From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 2 Oct 2019 10:40:55 -0400 Subject: SUNRPC: fix race to sk_err after xs_error_report Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") there has been a race to the value of the sk_err if both XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set. In that case, we may end up losing the sk_err value that existed when xs_error_report was called. Fix this by reverting to the previous behavior: instead of using SO_ERROR to retrieve the value at a later time (which might also return sk_err_soft), copy the sk_err value onto struct sock_xprt, and use that value to wake pending tasks. Signed-off-by: Benjamin Coddington Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9ac88722fa83..70e52f567b2a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1249,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2476,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2485,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: -- cgit v1.2.3 From 29ee2701529e1905c0e948688f9688c68c8d4ea4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 10 Oct 2019 10:16:09 +0200 Subject: net/smc: fix SMCD link group creation with VLAN id If creation of an SMCD link group with VLAN id fails, the initial smc_ism_get_vlan() step has to be reverted as well. Fixes: c6ba7c9ba43d ("net/smc: add base infrastructure for SMC-D and ISM") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4ca50ddf8d16..88556f0251ab 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -213,7 +213,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = SMC_CLC_DECL_MEM; - goto out; + goto ism_put_vlan; } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; @@ -289,6 +289,9 @@ clear_llc_lnk: smc_llc_link_clear(lnk); free_lgr: kfree(lgr); +ism_put_vlan: + if (ini->is_smcd && ini->vlan_id) + smc_ism_put_vlan(ini->ism_dev, ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) -- cgit v1.2.3 From 882dcfe5a1785c20f45820cbe6fec4b8b647c946 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 10 Oct 2019 10:16:10 +0200 Subject: net/smc: receive returns without data smc_cdc_rxed_any_close_or_senddone() is used as an end condition for the receive loop. This conflicts with smc_cdc_msg_recv_action() which could run in parallel and set the bits checked by smc_cdc_rxed_any_close_or_senddone() before the receive is processed. In that case we could return from receive with no data, although data is available. The same applies to smc_rx_wait(). Fix this by checking for RCV_SHUTDOWN only, which is set in smc_cdc_msg_recv_action() after the receive was actually processed. Fixes: 952310ccf2d8 ("smc: receive data from RMBE") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_rx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 413a6abf227e..000002642288 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -211,8 +211,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, rc = sk_wait_event(sk, timeo, sk->sk_err || sk->sk_shutdown & RCV_SHUTDOWN || - fcrit(conn) || - smc_cdc_rxed_any_close_or_senddone(conn), + fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -310,7 +309,6 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, smc_rx_update_cons(smc, 0); if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) break; -- cgit v1.2.3 From 107529e31a87acd475ff6a0f82745821b8f70fec Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 10 Oct 2019 10:16:11 +0200 Subject: net/smc: receive pending data after RCV_SHUTDOWN smc_rx_recvmsg() first checks if data is available, and then if RCV_SHUTDOWN is set. There is a race when smc_cdc_msg_recv_action() runs in between these 2 checks, receives data and sets RCV_SHUTDOWN. In that case smc_rx_recvmsg() would return from receive without to process the available data. Fix that with a final check for data available if RCV_SHUTDOWN is set. Move the check for data into a function and call it twice. And use the existing helper smc_rx_data_available(). Fixes: 952310ccf2d8 ("smc: receive data from RMBE") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_rx.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 000002642288..97e8369002d7 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -261,6 +261,18 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_rx_data_available(conn)) + return true; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + return false; +} + /* smc_rx_recvmsg - receive data from RMBE * @msg: copy data to receive buffer * @pipe: copy data to pipe if set - indicates splice() call @@ -302,15 +314,18 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; - if (atomic_read(&conn->bytes_to_rcv)) + if (smc_rx_recvmsg_data_available(smc)) goto copy; - else if (conn->urg_state == SMC_URG_VALID) - /* we received a single urgent Byte - skip */ - smc_rx_update_cons(smc, 0); if (sk->sk_shutdown & RCV_SHUTDOWN || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) { + /* smc_cdc_msg_recv_action() could have run after + * above smc_rx_recvmsg_data_available() + */ + if (smc_rx_recvmsg_data_available(smc)) + goto copy; break; + } if (read_done) { if (sk->sk_err || -- cgit v1.2.3 From 48f9bcf91461b43249949f4e172b5c0245dde751 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:41 -0700 Subject: net: sctp: Rename fallthrough label to unhandled fallthrough will become a pseudo reserved keyword so this only use of fallthrough is better renamed to allow it. Signed-off-by: Joe Perches Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Acked-by: Neil Horman Signed-off-by: Linus Torvalds --- net/sctp/sm_make_chunk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e41ed2e0ae7d..48d63956a68c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2155,7 +2155,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_SET_PRIMARY: if (ep->asconf_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ @@ -2166,11 +2166,11 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; - goto fallthrough; + goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association @@ -2187,7 +2187,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or @@ -2203,7 +2203,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) - goto fallthrough; + goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - @@ -2226,7 +2226,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net, retval = SCTP_IERROR_ABORT; } break; -fallthrough: +unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); -- cgit v1.2.3 From f0308fb0708078d6c1d8a4d533941a7a191af634 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Oct 2019 15:52:34 +0100 Subject: rxrpc: Fix possible NULL pointer access in ICMP handling If an ICMP packet comes in on the UDP socket backing an AF_RXRPC socket as the UDP socket is being shut down, rxrpc_error_report() may get called to deal with it after sk_user_data on the UDP socket has been cleared, leading to a NULL pointer access when this local endpoint record gets accessed. Fix this by just returning immediately if sk_user_data was NULL. The oops looks like the following: #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page ... RIP: 0010:rxrpc_error_report+0x1bd/0x6a9 ... Call Trace: ? sock_queue_err_skb+0xbd/0xde ? __udp4_lib_err+0x313/0x34d __udp4_lib_err+0x313/0x34d icmp_unreach+0x1ee/0x207 icmp_rcv+0x25b/0x28f ip_protocol_deliver_rcu+0x95/0x10e ip_local_deliver+0xe9/0x148 __netif_receive_skb_one_core+0x52/0x6e process_backlog+0xdc/0x177 net_rx_action+0xf9/0x270 __do_softirq+0x1b6/0x39a ? smpboot_register_percpu_thread+0xce/0xce run_ksoftirqd+0x1d/0x42 smpboot_thread_fn+0x19e/0x1b3 kthread+0xf1/0xf6 ? kthread_delayed_work_timer_fn+0x83/0x83 ret_from_fork+0x24/0x30 Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Reported-by: syzbot+611164843bd48cc2190c@syzkaller.appspotmail.com Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/peer_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index c97ebdc043e4..61451281d74a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -151,6 +151,9 @@ void rxrpc_error_report(struct sock *sk) struct rxrpc_peer *peer; struct sk_buff *skb; + if (unlikely(!local)) + return; + _enter("%p{%d}", sk, local->debug_id); /* Clear the outstanding error value on the socket so that it doesn't -- cgit v1.2.3 From d983ea6f16b835dcde2ee9a58a1e764ce68bfccc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:38 -0700 Subject: tcp: add rcu protection around tp->fastopen_rsk Both tcp_v4_err() and tcp_v6_err() do the following operations while they do not own the socket lock : fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; The problem is that without appropriate barrier, the compiler might reload tp->fastopen_rsk and trigger a NULL deref. request sockets are protected by RCU, we can simply add the missing annotations and barriers to solve the issue. Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/request_sock.c | 2 +- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp.c | 11 ++++++++--- net/ipv4/tcp_fastopen.c | 2 +- net/ipv4/tcp_input.c | 13 +++++++++---- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 11 ++++++----- net/ipv6/tcp_ipv6.c | 2 +- 10 files changed, 32 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index c9bb00008528..f35c2e998406 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -96,7 +96,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - tcp_sk(sk)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dbcf34ec8dd2..eb30fc1770de 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -906,7 +906,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, percpu_counter_inc(sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { - BUG_ON(tcp_sk(child)->fastopen_rsk != req); + BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if @@ -915,7 +915,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, * Also to satisfy an assertion in * tcp_v4_destroy_sock(). */ - tcp_sk(child)->fastopen_rsk = NULL; + RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8781a92ea4b6..c59d0bd29c5c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -543,7 +543,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected or passive Fast Open socket? */ if (state != TCP_SYN_SENT && - (state != TCP_SYN_RECV || tp->fastopen_rsk)) { + (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -2487,7 +2487,10 @@ adjudge_to_death: } if (sk->sk_state == TCP_CLOSE) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. @@ -3831,8 +3834,10 @@ EXPORT_SYMBOL(tcp_md5_hash_key); void tcp_done(struct sock *sk) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 3fd451271a70..a915ade0c818 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -253,7 +253,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, */ tp = tcp_sk(child); - tp->fastopen_rsk = req; + rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3578357abe30..5f9b102c3b55 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2666,7 +2666,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); - if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) && + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && tcp_try_undo_loss(sk, false)) return; @@ -2990,7 +2990,7 @@ void tcp_rearm_rto(struct sock *sk) /* If the retrans timer is currently being used by Fast Open * for SYN-ACK retrans purpose, stay put. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return; if (!tp->packets_out) { @@ -6087,6 +6087,8 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct request_sock *req; + tcp_try_undo_loss(sk, false); /* Reset rtx states to prevent spurious retransmits_timed_out() */ @@ -6096,7 +6098,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); + reqsk_fastopen_remove(sk, req, false); /* Re-arm the timer because data may have been sent out. * This is similar to the regular data transmission case @@ -6171,7 +6175,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_mstamp_refresh(tp); tp->rx_opt.saw_tstamp = 0; - req = tp->fastopen_rsk; + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); if (req) { bool req_stolen; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 492bf6a6b023..ffa366099eb2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -478,7 +478,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { @@ -2121,7 +2121,7 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - BUG_ON(tp->fastopen_rsk); + BUG_ON(rcu_access_pointer(tp->fastopen_rsk)); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bb140a5db8c0..5401dbd39c8f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -541,7 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; + RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fec6d67bfd14..84ae4d1449ea 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2482,7 +2482,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 05be564414e9..dd5a6317a801 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -386,15 +386,13 @@ abort: tcp_write_err(sk); * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. */ -static void tcp_fastopen_synack_timer(struct sock *sk) +static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct tcp_sock *tp = tcp_sk(sk); - struct request_sock *req; - req = tcp_sk(sk)->fastopen_rsk; req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { @@ -435,11 +433,14 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *req; - if (tp->fastopen_rsk) { + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - tcp_fastopen_synack_timer(sk); + tcp_fastopen_synack_timer(sk, req); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e3d9f4559c99..45a95e032bdf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -406,7 +406,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ - fastopen = tp->fastopen_rsk; + fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { -- cgit v1.2.3 From dba7d9b8c739df27ff3a234c81d6c6b23e3986fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:39 -0700 Subject: tcp: annotate tp->rcv_nxt lockless reads There are few places where we fetch tp->rcv_nxt while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that tcp_inq_hint() was already using READ_ONCE(tp->rcv_nxt) syzbot reported : BUG: KCSAN: data-race in tcp_poll / tcp_queue_rcv write to 0xffff888120425770 of 4 bytes by interrupt on cpu 0: tcp_rcv_nxt_update net/ipv4/tcp_input.c:3365 [inline] tcp_queue_rcv+0x180/0x380 net/ipv4/tcp_input.c:4638 tcp_rcv_established+0xbf1/0xf50 net/ipv4/tcp_input.c:5616 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542 tcp_v4_rcv+0x1a03/0x1bf0 net/ipv4/tcp_ipv4.c:1923 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 read to 0xffff888120425770 of 4 bytes by task 7254 on cpu 1: tcp_stream_is_readable net/ipv4/tcp.c:480 [inline] tcp_poll+0x204/0x6b0 net/ipv4/tcp.c:554 sock_poll+0xed/0x250 net/socket.c:1256 vfs_poll include/linux/poll.h:90 [inline] ep_item_poll.isra.0+0x90/0x190 fs/eventpoll.c:892 ep_send_events_proc+0x113/0x5c0 fs/eventpoll.c:1749 ep_scan_ready_list.constprop.0+0x189/0x500 fs/eventpoll.c:704 ep_send_events fs/eventpoll.c:1793 [inline] ep_poll+0xe3/0x900 fs/eventpoll.c:1930 do_epoll_wait+0x162/0x180 fs/eventpoll.c:2294 __do_sys_epoll_pwait fs/eventpoll.c:2325 [inline] __se_sys_epoll_pwait fs/eventpoll.c:2311 [inline] __x64_sys_epoll_pwait+0xcd/0x170 fs/eventpoll.c:2311 do_syscall_64+0xcf/0x2f0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 7254 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_diag.c | 2 +- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 3 ++- 6 files changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c59d0bd29c5c..883ee863db43 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -477,7 +477,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (tp->rcv_nxt - tp->copied_seq >= target) || + return (READ_ONCE(tp->rcv_nxt) - tp->copied_seq >= target) || (sk->sk_prot->stream_memory_read ? sk->sk_prot->stream_memory_read(sk) : false); } @@ -2935,7 +2935,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else if (tp->repair_queue == TCP_SEND_QUEUE) tp->write_seq = val; else if (tp->repair_queue == TCP_RECV_QUEUE) - tp->rcv_nxt = val; + WRITE_ONCE(tp->rcv_nxt, val); else err = -EINVAL; break; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 81a8221d650a..cd219161f106 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -26,7 +26,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); - r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } if (info) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5f9b102c3b55..5b7c8768ed5f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3362,7 +3362,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; - tp->rcv_nxt = seq; + WRITE_ONCE(tp->rcv_nxt, seq); } /* Update our send window. @@ -5932,7 +5932,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Ok.. it's good. Set up sequence numbers and * move to established. */ - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6035,7 +6035,7 @@ discard: tp->tcp_header_len = sizeof(struct tcphdr); } - tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->copied_seq = tp->rcv_nxt; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ffa366099eb2..5089dd6bee0f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2455,7 +2455,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5401dbd39c8f..adc6ce486a38 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -462,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; struct tcp_sock *oldtp, *newtp; + u32 seq; if (!newsk) return NULL; @@ -475,8 +476,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, /* Now setup tcp_sock */ newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; + seq = treq->rcv_isn + 1; + newtp->rcv_wup = seq; + newtp->copied_seq = seq; + WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; newtp->snd_sml = newtp->snd_una = diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 45a95e032bdf..89ea0a7018b5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1895,7 +1895,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) /* Because we don't lock the socket, * we might find a transient negative value. */ - rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - + tp->copied_seq, 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " -- cgit v1.2.3 From 7db48e983930285b765743ebd665aecf9850582b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:40 -0700 Subject: tcp: annotate tp->copied_seq lockless reads There are few places where we fetch tp->copied_seq while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that tcp_inq_hint() was already using READ_ONCE(tp->copied_seq) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 20 ++++++++++---------- net/ipv4/tcp_diag.c | 3 ++- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 883ee863db43..c322ad071e17 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -477,7 +477,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (READ_ONCE(tp->rcv_nxt) - tp->copied_seq >= target) || + return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || (sk->sk_prot->stream_memory_read ? sk->sk_prot->stream_memory_read(sk) : false); } @@ -546,7 +546,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); - if (tp->urg_seq == tp->copied_seq && + if (tp->urg_seq == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; @@ -607,7 +607,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -1668,9 +1668,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb); if (!desc->count) break; - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); } - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -1819,7 +1819,7 @@ static int tcp_zerocopy_receive(struct sock *sk, out: up_read(¤t->mm->mmap_sem); if (length) { - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ @@ -2117,7 +2117,7 @@ found_ok_skb: if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; + WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; @@ -2139,7 +2139,7 @@ found_ok_skb: } } - *seq += used; + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2166,7 +2166,7 @@ skip_copy: found_fin_ok: /* Process the FIN. */ - ++*seq; + WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; @@ -2588,7 +2588,7 @@ int tcp_disconnect(struct sock *sk, int flags) __kfree_skb(sk->sk_rx_skb_cache); sk->sk_rx_skb_cache = NULL; } - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index cd219161f106..66273c8a55c2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -26,7 +26,8 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); - r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - tp->copied_seq, 0); + r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } if (info) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5b7c8768ed5f..a30aae3a6a18 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5961,7 +5961,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); @@ -6036,7 +6036,7 @@ discard: } WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6216,7 +6216,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5089dd6bee0f..39560f482e0b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2456,7 +2456,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - - tp->copied_seq, 0); + READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index adc6ce486a38..c4731d26ab4a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -478,7 +478,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; - newtp->copied_seq = seq; + WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 84ae4d1449ea..7dda12720169 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3433,7 +3433,7 @@ static void tcp_connect_init(struct sock *sk) else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 89ea0a7018b5..a62c7042fc4a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1896,7 +1896,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - - tp->copied_seq, 0); + READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " -- cgit v1.2.3 From 0f31746452e6793ad6271337438af8f4defb8940 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:41 -0700 Subject: tcp: annotate tp->write_seq lockless reads There are few places where we fetch tp->write_seq while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 20 ++++++++++++-------- net/ipv4/tcp_diag.c | 2 +- net/ipv4/tcp_ipv4.c | 21 ++++++++++++--------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 13 +++++++------ 6 files changed, 35 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c322ad071e17..96dd65cbeb85 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -616,7 +616,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_una; + answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) @@ -625,7 +625,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - tp->snd_nxt; break; default: return -ENOIOCTLCMD; @@ -1035,7 +1035,7 @@ new_segment: sk->sk_wmem_queued += copy; sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1362,7 +1362,7 @@ new_segment: if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -2562,6 +2562,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2604,9 +2605,12 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; - tp->write_seq += tp->max_window + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; @@ -2933,7 +2937,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sk->sk_state != TCP_CLOSE) err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) - tp->write_seq = val; + WRITE_ONCE(tp->write_seq, val); else if (tp->repair_queue == TCP_RECV_QUEUE) WRITE_ONCE(tp->rcv_nxt, val); else diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 66273c8a55c2..549506162dde 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -28,7 +28,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 39560f482e0b..6be568334848 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -164,9 +164,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) * without appearing to create any others. */ if (likely(!tp->repair)) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + u32 seq = tcptw->tw_snd_nxt + 65535 + 2; + + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; } @@ -253,7 +255,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; @@ -291,10 +293,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port); + WRITE_ONCE(tp->write_seq, + secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port)); tp->tsoffset = secure_tcp_ts_off(sock_net(sk), inet->inet_saddr, inet->inet_daddr); @@ -2461,7 +2464,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c4731d26ab4a..339944690329 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -498,7 +498,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7dda12720169..c17c2a78809d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1196,7 +1196,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; @@ -3449,7 +3449,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) __skb_header_release(skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); - tp->write_seq = tcb->end_seq; + WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a62c7042fc4a..4804b6dc5e65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -215,7 +215,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; @@ -311,10 +311,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); + WRITE_ONCE(tp->write_seq, + secure_tcpv6_seq(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk), np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); @@ -1907,7 +1908,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), -- cgit v1.2.3 From e0d694d638dba768b47be31c22e1a9b4f862f561 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:42 -0700 Subject: tcp: annotate tp->snd_nxt lockless reads There are few places where we fetch tp->snd_nxt while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_minisocks.c | 6 ++++-- net/ipv4/tcp_output.c | 10 +++++----- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 96dd65cbeb85..652568750cb1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -625,7 +625,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = READ_ONCE(tp->write_seq) - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 339944690329..c802bc80c400 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -482,8 +482,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + seq = treq->snt_isn + 1; + newtp->snd_sml = newtp->snd_una = seq; + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c17c2a78809d..a115a991dfb5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -67,7 +67,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); @@ -3142,7 +3142,7 @@ void tcp_send_fin(struct sock *sk) * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ - tp->snd_nxt++; + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { @@ -3426,7 +3426,7 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; @@ -3586,11 +3586,11 @@ int tcp_connect(struct sock *sk) /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { - tp->snd_nxt = TCP_SKB_CB(buff)->seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); -- cgit v1.2.3 From d9b55bf7b6788ec0bd1db1acefbc4feb1399144a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:43 -0700 Subject: tcp: annotate tp->urg_seq lockless reads There two places where we fetch tp->urg_seq while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write side use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +++-- net/ipv4/tcp_input.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 652568750cb1..577a8c6eef9f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -546,7 +546,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); - if (tp->urg_seq == READ_ONCE(tp->copied_seq) && + if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; @@ -607,7 +607,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq); + answ = tp->urg_data && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a30aae3a6a18..16342e043ab3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5356,7 +5356,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; -- cgit v1.2.3 From ebb3b78db7bf842270a46fd4fe7cc45c78fa5ed6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:44 -0700 Subject: tcp: annotate sk->sk_rcvbuf lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_rcvbuf while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that other transports probably need similar fixes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 3 ++- net/core/skbuff.c | 2 +- net/core/sock.c | 5 +++-- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 7 ++++--- 5 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a50c0b6846f2..7deceaeeed7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4252,7 +4252,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + WRITE_ONCE(sk->sk_rcvbuf, + max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 529133611ea2..8c178703467b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4415,7 +4415,7 @@ static void skb_set_err_queue(struct sk_buff *skb) int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) { if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned int)sk->sk_rcvbuf) + (unsigned int)READ_ONCE(sk->sk_rcvbuf)) return -ENOMEM; skb_orphan(skb); diff --git a/net/core/sock.c b/net/core/sock.c index 2a053999df11..8c8f61e70141 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -831,7 +831,8 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + WRITE_ONCE(sk->sk_rcvbuf, + max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_RCVBUFFORCE: @@ -3204,7 +3205,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 577a8c6eef9f..bc0481aa6633 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,7 +451,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; + WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); sk_sockets_allocated_inc(sk); sk->sk_route_forced_caps = NETIF_F_GSO; @@ -1711,7 +1711,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) val <<= 1; if (val > sk->sk_rcvbuf) { - sk->sk_rcvbuf = val; + WRITE_ONCE(sk->sk_rcvbuf, val); tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); } return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 16342e043ab3..6995df20710a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -483,8 +483,9 @@ static void tcp_clamp_window(struct sock *sk) !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2]); + WRITE_ONCE(sk->sk_rcvbuf, + min(atomic_read(&sk->sk_rmem_alloc), + net->ipv4.sysctl_tcp_rmem[2])); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -648,7 +649,7 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, rcvwin * rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { - sk->sk_rcvbuf = rcvbuf; + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ tp->window_clamp = tcp_win_from_space(sk, rcvbuf); -- cgit v1.2.3 From e292f05e0df73f9fcc93329663936e1ded97a988 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:45 -0700 Subject: tcp: annotate sk->sk_sndbuf lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_sndbuf while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that other transports probably need similar fixes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 3 ++- net/core/sock.c | 15 +++++++++------ net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 3 ++- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 7deceaeeed7b..3fed5755494b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4258,7 +4258,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); break; case SO_MAX_PACING_RATE: /* 32bit version */ if (val != ~0U) diff --git a/net/core/sock.c b/net/core/sock.c index 8c8f61e70141..cd075bc86407 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -785,7 +785,8 @@ set_sndbuf: */ val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -2089,8 +2090,10 @@ EXPORT_SYMBOL(sock_i_ino); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { - if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + if (force || + refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -2191,7 +2194,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (sk->sk_shutdown & SEND_SHUTDOWN) break; @@ -2226,7 +2229,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -2807,7 +2810,7 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | @@ -3207,7 +3210,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc0481aa6633..111853262972 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -450,7 +450,7 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); sk_sockets_allocated_inc(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6995df20710a..a2e52ad7cdab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,8 @@ static void tcp_sndbuf_expand(struct sock *sk) sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); + WRITE_ONCE(sk->sk_sndbuf, + min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) -- cgit v1.2.3 From ab4e846a82d0ae00176de19f2db3c5c64f8eb5f2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:46 -0700 Subject: tcp: annotate sk->sk_wmem_queued lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_wmem_queued while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. sk_wmem_queued_add() helper is added so that we can in the future convert to ADD_ONCE() or equivalent if/when available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- net/core/sock.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_output.c | 14 +++++++------- net/sched/em_meta.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index 4cc8dc5db2b7..c210fc116103 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -640,7 +640,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->len += copied; skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { - sk->sk_wmem_queued += truesize; + sk_wmem_queued_add(sk, truesize); sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); diff --git a/net/core/sock.c b/net/core/sock.c index cd075bc86407..a515392ba84b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3212,7 +3212,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bbb005eb5218..7dc79b973e6e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), - .idiag_wmem = sk->sk_wmem_queued, + .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk->sk_forward_alloc, .idiag_tmem = sk_wmem_alloc_get(sk), }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 111853262972..b2ac4f074e2d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -659,7 +659,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; @@ -1034,7 +1034,7 @@ new_segment: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a115a991dfb5..0488607c5cd3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1199,7 +1199,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -1333,7 +1333,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; @@ -1443,7 +1443,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; - sk->sk_wmem_queued -= delta_truesize; + sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } @@ -1888,7 +1888,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return -ENOMEM; skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2152,7 +2152,7 @@ static int tcp_mtu_probe(struct sock *sk) nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); @@ -3222,7 +3222,7 @@ int tcp_send_synack(struct sock *sk) tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -3447,7 +3447,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 4c9122fc35c9..3177dcb17316 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -446,7 +446,7 @@ META_COLLECTOR(int_sk_wmem_queued) *err = -1; return; } - dst->value = sk->sk_wmem_queued; + dst->value = READ_ONCE(sk->sk_wmem_queued); } META_COLLECTOR(int_sk_fwd_alloc) -- cgit v1.2.3 From b0818f80c8c1bc215bba276bd61c216014fab23b Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 11 Oct 2019 18:14:55 -0700 Subject: blackhole_netdev: fix syzkaller reported issue While invalidating the dst, we assign backhole_netdev instead of loopback device. However, this device does not have idev pointer and hence no ip6_ptr even if IPv6 is enabled. Possibly this has triggered the syzbot reported crash. The syzbot report does not have reproducer, however, this is the only device that doesn't have matching idev created. Crash instruction is : static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); return !!idev->cnf.ignore_routes_with_linkdown; <= crash } Also ipv6 always assumes presence of idev and never checks for it being NULL (as does the above referenced code). So adding a idev for the blackhole_netdev to avoid this class of crashes in the future. Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 7 ++++++- net/ipv6/route.c | 15 ++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 34ccef18b40e..4c87594d1389 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6996,7 +6996,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { - struct inet6_dev *idev; + struct inet6_dev *idev, *bdev; int i, err; err = ipv6_addr_label_init(); @@ -7036,10 +7036,14 @@ int __init addrconf_init(void) */ rtnl_lock(); idev = ipv6_add_dev(init_net.loopback_dev); + bdev = ipv6_add_dev(blackhole_netdev); rtnl_unlock(); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; + } else if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto errlo; } ip6_route_init_special_entries(); @@ -7124,6 +7128,7 @@ void addrconf_cleanup(void) addrconf_ifdown(dev, 1); } addrconf_ifdown(init_net.loopback_dev, 2); + addrconf_ifdown(blackhole_netdev, 2); /* * Check hash table. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a63ff85fe141..742120728869 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt) static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - struct net_device *loopback_dev = net->loopback_dev; int cpu; - if (dev == loopback_dev) + if (dev == net->loopback_dev) return; for_each_possible_cpu(cpu) { @@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { - rt->rt6i_idev = in6_dev_get(loopback_dev); + rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); } @@ -386,13 +385,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *loopback_dev = - dev_net(dev)->loopback_dev; - if (idev && idev->dev != loopback_dev) { - struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); - if (loopback_idev) { - rt->rt6i_idev = loopback_idev; + if (idev && idev->dev != dev_net(dev)->loopback_dev) { + struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev); + if (ibdev) { + rt->rt6i_idev = ibdev; in6_dev_put(idev); } } -- cgit v1.2.3 From dedc5a08da07874c6e0d411e7f39c5c2cf137014 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 12 Oct 2019 13:55:06 +0200 Subject: net: avoid errors when trying to pop MLPS header on non-MPLS packets the following script: # tc qdisc add dev eth0 clsact # tc filter add dev eth0 egress matchall action mpls pop implicitly makes the kernel drop all packets transmitted by eth0, if they don't have a MPLS header. This behavior is uncommon: other encapsulations (like VLAN) just let the packet pass unmodified. Since the result of MPLS 'pop' operation would be the same regardless of the presence / absence of MPLS header(s) in the original packet, we can let skb_mpls_pop() return 0 when dealing with non-MPLS packets. For the OVS use-case, this is acceptable because __ovs_nla_copy_actions() already ensures that MPLS 'pop' operation only occurs with packets having an MPLS Ethernet type (and there are no other callers in current code, so the semantic change should be ok). v2: better documentation of use-cases for skb_mpls_pop(), thanks to Simon Horman Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC") Reviewed-by: Simon Horman Acked-by: John Hurley Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8c178703467b..03b6809ebde4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5536,7 +5536,7 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto) int err; if (unlikely(!eth_p_mpls(skb->protocol))) - return -EINVAL; + return 0; err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); if (unlikely(err)) -- cgit v1.2.3 From fa4e0f8855fcba600e0be2575ee29c69166f74bd Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 12 Oct 2019 13:55:07 +0200 Subject: net/sched: fix corrupted L2 header with MPLS 'push' and 'pop' actions the following script: # tc qdisc add dev eth0 clsact # tc filter add dev eth0 egress protocol ip matchall \ > action mpls push protocol mpls_uc label 0x355aa bos 1 causes corruption of all IP packets transmitted by eth0. On TC egress, we can't rely on the value of skb->mac_len, because it's 0 and a MPLS 'push' operation will result in an overwrite of the first 4 octets in the packet L2 header (e.g. the Destination Address if eth0 is an Ethernet); the same error pattern is present also in the MPLS 'pop' operation. Fix this error in act_mpls data plane, computing 'mac_len' as the difference between the network header and the mac header (when not at TC ingress), and use it in MPLS 'push'/'pop' core functions. v2: unbreak 'make htmldocs' because of missing documentation of 'mac_len' in skb_mpls_pop(), reported by kbuild test robot CC: Lorenzo Bianconi Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC") Reviewed-by: Simon Horman Acked-by: John Hurley Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/core/skbuff.c | 19 +++++++++++-------- net/openvswitch/actions.c | 5 +++-- net/sched/act_mpls.c | 12 ++++++++---- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 03b6809ebde4..867e61df00db 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5477,12 +5477,14 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, * @skb: buffer * @mpls_lse: MPLS label stack entry to push * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) + * @mac_len: length of the MAC header * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len) { struct mpls_shim_hdr *lse; int err; @@ -5499,15 +5501,15 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto) return err; if (!skb->inner_protocol) { - skb_set_inner_network_header(skb, skb->mac_len); + skb_set_inner_network_header(skb, mac_len); skb_set_inner_protocol(skb, skb->protocol); } skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); + mac_len); skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); + skb_set_network_header(skb, mac_len); lse = mpls_hdr(skb); lse->label_stack_entry = mpls_lse; @@ -5526,29 +5528,30 @@ EXPORT_SYMBOL_GPL(skb_mpls_push); * * @skb: buffer * @next_proto: ethertype of header after popped MPLS header + * @mac_len: length of the MAC header * * Expects skb->data at mac header. * * Returns 0 on success, -errno otherwise. */ -int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto) +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len) { int err; if (unlikely(!eth_p_mpls(skb->protocol))) return 0; - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); if (unlikely(err)) return err; skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); + mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); + skb_set_network_header(skb, mac_len); if (skb->dev && skb->dev->type == ARPHRD_ETHER) { struct ethhdr *hdr; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 3572e11b6f21..1c77f520f474 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -165,7 +165,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype); + err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype, + skb->mac_len); if (err) return err; @@ -178,7 +179,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, { int err; - err = skb_mpls_pop(skb, ethertype); + err = skb_mpls_pop(skb, ethertype, skb->mac_len); if (err) return err; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index e168df0e008a..4cf6c553bb0b 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -55,7 +55,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret; + int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -63,8 +63,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. */ - if (skb_at_tc_ingress(skb)) + if (skb_at_tc_ingress(skb)) { skb_push_rcsum(skb, skb->mac_len); + mac_len = skb->mac_len; + } else { + mac_len = skb_network_header(skb) - skb_mac_header(skb); + } ret = READ_ONCE(m->tcf_action); @@ -72,12 +76,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, switch (p->tcfm_action) { case TCA_MPLS_ACT_POP: - if (skb_mpls_pop(skb, p->tcfm_proto)) + if (skb_mpls_pop(skb, p->tcfm_proto, mac_len)) goto drop; break; case TCA_MPLS_ACT_PUSH: new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); - if (skb_mpls_push(skb, new_lse, p->tcfm_proto)) + if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len)) goto drop; break; case TCA_MPLS_ACT_MODIFY: -- cgit v1.2.3 From cab209e571a9375f7dc6db69a6c40d2d98e57e3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Oct 2019 06:47:57 -0700 Subject: tcp: fix a possible lockdep splat in tcp_done() syzbot found that if __inet_inherit_port() returns an error, we call tcp_done() after inet_csk_prepare_forced_close(), meaning the socket lock is no longer held. We might fix this in a different way in net-next, but for 5.4 it seems safer to relax the lockdep check. Fixes: d983ea6f16b8 ("tcp: add rcu protection around tp->fastopen_rsk") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b2ac4f074e2d..42187a3b82f4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3842,8 +3842,12 @@ void tcp_done(struct sock *sk) { struct request_sock *req; - req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, - lockdep_sock_is_held(sk)); + /* We might be called with a new socket, after + * inet_csk_prepare_forced_close() has been called + * so we can not use lockdep_sock_is_held(sk) + */ + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); -- cgit v1.2.3 From 39f13ea2f61b439ebe0060393e9c39925c9ee28c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Oct 2019 11:22:30 -0700 Subject: net: avoid potential infinite loop in tc_ctl_action() tc_ctl_action() has the ability to loop forever if tcf_action_add() returns -EAGAIN. This special case has been done in case a module needed to be loaded, but it turns out that tcf_add_notify() could also return -EAGAIN if the socket sk_rcvbuf limit is hit. We need to separate the two cases, and only loop for the module loading case. While we are at it, add a limit of 10 attempts since unbounded loops are always scary. syzbot repro was something like : socket(PF_NETLINK, SOCK_RAW|SOCK_NONBLOCK, NETLINK_ROUTE) = 3 write(3, ..., 38) = 38 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [0], 4) = 0 sendmsg(3, {msg_name(0)=NULL, msg_iov(1)=[{..., 388}], msg_controllen=0, msg_flags=0x10}, ...) NMI backtrace for cpu 0 CPU: 0 PID: 1054 Comm: khungtaskd Not tainted 5.4.0-rc1+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 nmi_cpu_backtrace.cold+0x70/0xb2 lib/nmi_backtrace.c:101 nmi_trigger_cpumask_backtrace+0x23b/0x28b lib/nmi_backtrace.c:62 arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38 trigger_all_cpu_backtrace include/linux/nmi.h:146 [inline] check_hung_uninterruptible_tasks kernel/hung_task.c:205 [inline] watchdog+0x9d0/0xef0 kernel/hung_task.c:289 kthread+0x361/0x430 kernel/kthread.c:255 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352 Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 8859 Comm: syz-executor910 Not tainted 5.4.0-rc1+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:arch_local_save_flags arch/x86/include/asm/paravirt.h:751 [inline] RIP: 0010:lockdep_hardirqs_off+0x1df/0x2e0 kernel/locking/lockdep.c:3453 Code: 5c 08 00 00 5b 41 5c 41 5d 5d c3 48 c7 c0 58 1d f3 88 48 ba 00 00 00 00 00 fc ff df 48 c1 e8 03 80 3c 10 00 0f 85 d3 00 00 00 <48> 83 3d 21 9e 99 07 00 0f 84 b9 00 00 00 9c 58 0f 1f 44 00 00 f6 RSP: 0018:ffff8880a6f3f1b8 EFLAGS: 00000046 RAX: 1ffffffff11e63ab RBX: ffff88808c9c6080 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000000 RDI: ffff88808c9c6914 RBP: ffff8880a6f3f1d0 R08: ffff88808c9c6080 R09: fffffbfff16be5d1 R10: fffffbfff16be5d0 R11: 0000000000000003 R12: ffffffff8746591f R13: ffff88808c9c6080 R14: ffffffff8746591f R15: 0000000000000003 FS: 00000000011e4880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffff600400 CR3: 00000000a8920000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: trace_hardirqs_off+0x62/0x240 kernel/trace/trace_preemptirq.c:45 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline] _raw_spin_lock_irqsave+0x6f/0xcd kernel/locking/spinlock.c:159 __wake_up_common_lock+0xc8/0x150 kernel/sched/wait.c:122 __wake_up+0xe/0x10 kernel/sched/wait.c:142 netlink_unlock_table net/netlink/af_netlink.c:466 [inline] netlink_unlock_table net/netlink/af_netlink.c:463 [inline] netlink_broadcast_filtered+0x705/0xb80 net/netlink/af_netlink.c:1514 netlink_broadcast+0x3a/0x50 net/netlink/af_netlink.c:1534 rtnetlink_send+0xdd/0x110 net/core/rtnetlink.c:714 tcf_add_notify net/sched/act_api.c:1343 [inline] tcf_action_add+0x243/0x370 net/sched/act_api.c:1362 tc_ctl_action+0x3b5/0x4bc net/sched/act_api.c:1410 rtnetlink_rcv_msg+0x463/0xb00 net/core/rtnetlink.c:5386 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5404 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x803/0x920 net/socket.c:2311 __sys_sendmsg+0x105/0x1d0 net/socket.c:2356 __do_sys_sendmsg net/socket.c:2365 [inline] __se_sys_sendmsg net/socket.c:2363 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440939 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot+cf0adbb9c28c8866c788@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- net/sched/act_api.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4e7429c6f864..69d4676a402f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1353,11 +1353,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { size_t attr_size = 0; - int ret = 0; + int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, - &attr_size, true, extack); + for (loop = 0; loop < 10; loop++) { + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, + actions, &attr_size, true, extack); + if (ret != -EAGAIN) + break; + } + if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); @@ -1407,11 +1412,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, */ if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; -replay: ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, extack); - if (ret == -EAGAIN) - goto replay; break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, -- cgit v1.2.3 From 28aa7c86c2b49f659c8460a89e53b506c45979bb Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 14 Oct 2019 13:38:22 -0700 Subject: sched: etf: Fix ordering of packets with same txtime When a application sends many packets with the same txtime, they may be transmitted out of order (different from the order in which they were enqueued). This happens because when inserting elements into the tree, when the txtime of two packets are the same, the new packet is inserted at the left side of the tree, causing the reordering. The only effect of this change should be that packets with the same txtime will be transmitted in the order they are enqueued. The application in question (the AVTP GStreamer plugin, still in development) is sending video traffic, in which each video frame have a single presentation time, the problem is that when packetizing, multiple packets end up with the same txtime. The receiving side was rejecting packets because they were being received out of order. Fixes: 25db26a91364 ("net/sched: Introduce the ETF Qdisc") Reported-by: Ederson de Souza Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_etf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index cebfb65d8556..b1da5589a0c6 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -177,7 +177,7 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, parent = *p; skb = rb_to_skb(parent); - if (ktime_after(txtime, skb->tstamp)) { + if (ktime_compare(txtime, skb->tstamp) >= 0) { p = &parent->rb_right; leftmost = false; } else { -- cgit v1.2.3 From 63dfb7938b13fa2c2fbcb45f34d065769eb09414 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Oct 2019 15:24:38 +0800 Subject: sctp: change sctp_prot .no_autobind with true syzbot reported a memory leak: BUG: memory leak, unreferenced object 0xffff888120b3d380 (size 64): backtrace: [...] slab_alloc mm/slab.c:3319 [inline] [...] kmem_cache_alloc+0x13f/0x2c0 mm/slab.c:3483 [...] sctp_bucket_create net/sctp/socket.c:8523 [inline] [...] sctp_get_port_local+0x189/0x5a0 net/sctp/socket.c:8270 [...] sctp_do_bind+0xcc/0x200 net/sctp/socket.c:402 [...] sctp_bindx_add+0x4b/0xd0 net/sctp/socket.c:497 [...] sctp_setsockopt_bindx+0x156/0x1b0 net/sctp/socket.c:1022 [...] sctp_setsockopt net/sctp/socket.c:4641 [inline] [...] sctp_setsockopt+0xaea/0x2dc0 net/sctp/socket.c:4611 [...] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3147 [...] __sys_setsockopt+0x10f/0x220 net/socket.c:2084 [...] __do_sys_setsockopt net/socket.c:2100 [inline] It was caused by when sending msgs without binding a port, in the path: inet_sendmsg() -> inet_send_prepare() -> inet_autobind() -> .get_port/sctp_get_port(), sp->bind_hash will be set while bp->port is not. Later when binding another port by sctp_setsockopt_bindx(), a new bucket will be created as bp->port is not set. sctp's autobind is supposed to call sctp_autobind() where it does all things including setting bp->port. Since sctp_autobind() is called in sctp_sendmsg() if the sk is not yet bound, it should have skipped the auto bind. THis patch is to avoid calling inet_autobind() in inet_send_prepare() by changing sctp_prot .no_autobind with true, also remove the unused .get_port. Reported-by: syzbot+d44f7bbebdea49dbc84a@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 939b8d2595bc..5ca0ec0e823c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9500,7 +9500,7 @@ struct proto sctp_prot = { .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, - .get_port = sctp_get_port, + .no_autobind = true, .obj_size = sizeof(struct sctp_sock), .useroffset = offsetof(struct sctp_sock, subscribe), .usersize = offsetof(struct sctp_sock, initmsg) - @@ -9542,7 +9542,7 @@ struct proto sctpv6_prot = { .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, - .get_port = sctp_get_port, + .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - -- cgit v1.2.3 From bd74708cd979f4934f0744055ce3b47da68733ce Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 16 Oct 2019 00:04:38 -0700 Subject: Revert "blackhole_netdev: fix syzkaller reported issue" This reverts commit b0818f80c8c1bc215bba276bd61c216014fab23b. Started seeing weird behavior after this patch especially in the IPv6 code path. Haven't root caused it, but since this was applied to net branch, taking a precautionary measure to revert it and look / analyze those failures Revert this now and I'll send a better fix after analysing / fixing the weirdness observed. CC: Eric Dumazet CC: Wei Wang CC: David S. Miller Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 7 +------ net/ipv6/route.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4c87594d1389..34ccef18b40e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6996,7 +6996,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { - struct inet6_dev *idev, *bdev; + struct inet6_dev *idev; int i, err; err = ipv6_addr_label_init(); @@ -7036,14 +7036,10 @@ int __init addrconf_init(void) */ rtnl_lock(); idev = ipv6_add_dev(init_net.loopback_dev); - bdev = ipv6_add_dev(blackhole_netdev); rtnl_unlock(); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; - } else if (IS_ERR(bdev)) { - err = PTR_ERR(bdev); - goto errlo; } ip6_route_init_special_entries(); @@ -7128,7 +7124,6 @@ void addrconf_cleanup(void) addrconf_ifdown(dev, 1); } addrconf_ifdown(init_net.loopback_dev, 2); - addrconf_ifdown(blackhole_netdev, 2); /* * Check hash table. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 742120728869..a63ff85fe141 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,9 +155,10 @@ void rt6_uncached_list_del(struct rt6_info *rt) static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { + struct net_device *loopback_dev = net->loopback_dev; int cpu; - if (dev == net->loopback_dev) + if (dev == loopback_dev) return; for_each_possible_cpu(cpu) { @@ -170,7 +171,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { - rt->rt6i_idev = in6_dev_get(blackhole_netdev); + rt->rt6i_idev = in6_dev_get(loopback_dev); in6_dev_put(rt_idev); } @@ -385,11 +386,13 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *loopback_dev = + dev_net(dev)->loopback_dev; - if (idev && idev->dev != dev_net(dev)->loopback_dev) { - struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev); - if (ibdev) { - rt->rt6i_idev = ibdev; + if (idev && idev->dev != loopback_dev) { + struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; in6_dev_put(idev); } } -- cgit v1.2.3 From 2ca4f6ca4562594ef161e4140c2a5e0e5282967b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Oct 2019 06:04:38 -0700 Subject: rxrpc: use rcu protection while reading sk->sk_user_data We need to extend the rcu_read_lock() section in rxrpc_error_report() and use rcu_dereference_sk_user_data() instead of plain access to sk->sk_user_data to make sure all rules are respected. The compiler wont reload sk->sk_user_data at will, and RCU rules prevent memory beeing freed too soon. Fixes: f0308fb07080 ("rxrpc: Fix possible NULL pointer access in ICMP handling") Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: Eric Dumazet Cc: David Howells Signed-off-by: David S. Miller --- net/rxrpc/peer_event.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 61451281d74a..48f67a9b1037 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -147,13 +147,16 @@ void rxrpc_error_report(struct sock *sk) { struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local; struct rxrpc_peer *peer; struct sk_buff *skb; - if (unlikely(!local)) + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); return; - + } _enter("%p{%d}", sk, local->debug_id); /* Clear the outstanding error value on the socket so that it doesn't @@ -163,6 +166,7 @@ void rxrpc_error_report(struct sock *sk) skb = sock_dequeue_err_skb(sk); if (!skb) { + rcu_read_unlock(); _leave("UDP socket errqueue empty"); return; } @@ -170,11 +174,11 @@ void rxrpc_error_report(struct sock *sk) serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); + rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); return; } - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; -- cgit v1.2.3 From 595e0651d0296bad2491a4a29a7a43eae6328b02 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 16 Oct 2019 20:52:09 +0200 Subject: ipv4: Return -ENETUNREACH if we can't create route but saddr is valid ...instead of -EINVAL. An issue was found with older kernel versions while unplugging a NFS client with pending RPCs, and the wrong error code here prevented it from recovering once link is back up with a configured address. Incidentally, this is not an issue anymore since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context"), included in 5.2-rc7, had the effect of decoupling the forwarding of this error by using SO_ERROR in xs_wake_error(), as pointed out by Benjamin Coddington. To the best of my knowledge, this isn't currently causing any further issue, but the error code doesn't look appropriate anyway, and we might hit this in other paths as well. In detail, as analysed by Gonzalo Siero, once the route is deleted because the interface is down, and can't be resolved and we return -EINVAL here, this ends up, courtesy of inet_sk_rebuild_header(), as the socket error seen by tcp_write_err(), called by tcp_retransmit_timer(). In turn, tcp_write_err() indirectly calls xs_error_report(), which wakes up the RPC pending tasks with a status of -EINVAL. This is then seen by call_status() in the SUN RPC implementation, which aborts the RPC call calling rpc_exit(), instead of handling this as a potentially temporary condition, i.e. as a timeout. Return -EINVAL only if the input parameters passed to ip_route_output_key_hash_rcu() are actually invalid (this is the case if the specified source address is multicast, limited broadcast or all zeroes), but return -ENETUNREACH in all cases where, at the given moment, the given source address doesn't allow resolving the route. While at it, drop the initialisation of err to -ENETUNREACH, which was added to __ip_route_output_key() back then by commit 0315e3827048 ("net: Fix behaviour of unreachable, blackhole and prohibit routes"), but actually had no effect, as it was, and is, overwritten by the fib_lookup() return code assignment, and anyway ignored in all other branches, including the if (fl4->saddr) one: I find this rather confusing, as it would look like -ENETUNREACH is the "default" error, while that statement has no effect. Also note that after commit fc75fc8339e7 ("ipv4: dont create routes on down devices"), we would get -ENETUNREACH if the device is down, but -EINVAL if the source address is specified and we can't resolve the route, and this appears to be rather inconsistent. Reported-by: Stefan Walter Analysed-by: Benjamin Coddington Analysed-by: Gonzalo Siero Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/route.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 14654876127e..5bc172abd143 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2470,14 +2470,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; - int err = -ENETUNREACH; + int err; if (fl4->saddr) { - rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) + ipv4_is_zeronet(fl4->saddr)) { + rth = ERR_PTR(-EINVAL); goto out; + } + + rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: -- cgit v1.2.3 From 5018c59607a511cdee743b629c76206d9c9e6d7b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 16 Oct 2019 12:03:15 -0700 Subject: ipv4: fix race condition between route lookup and invalidation Jesse and Ido reported the following race condition: - Received packet A is forwarded and cached dst entry is taken from the nexthop ('nhc->nhc_rth_input'). Calls skb_dst_set() - Given Jesse has busy routers ("ingesting full BGP routing tables from multiple ISPs"), route is added / deleted and rt_cache_flush() is called - Received packet B tries to use the same cached dst entry from t0, but rt_cache_valid() is no longer true and it is replaced in rt_cache_route() by the newer one. This calls dst_dev_put() on the original dst entry which assigns the blackhole netdev to 'dst->dev' - dst_input(skb) is called on packet A and it is dropped due to 'dst->dev' being the blackhole netdev There are 2 issues in the v4 routing code: 1. A per-netns counter is used to do the validation of the route. That means whenever a route is changed in the netns, users of all routes in the netns needs to redo lookup. v6 has an implementation of only updating fn_sernum for routes that are affected. 2. When rt_cache_valid() returns false, rt_cache_route() is called to throw away the current cache, and create a new one. This seems unnecessary because as long as this route does not change, the route cache does not need to be recreated. To fully solve the above 2 issues, it probably needs quite some code changes and requires careful testing, and does not suite for net branch. So this patch only tries to add the deleted cached rt into the uncached list, so user could still be able to use it to receive packets until it's done. Fixes: 95c47f9cf5e0 ("ipv4: call dst_dev_put() properly") Signed-off-by: Wei Wang Reported-by: Ido Schimmel Reported-by: Jesse Hathaway Tested-by: Jesse Hathaway Acked-by: Martin KaFai Lau Cc: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5bc172abd143..621f83434b24 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1482,7 +1482,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { - dst_dev_put(&orig->dst); + rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { -- cgit v1.2.3 From 9669fffc1415bb0c30e5d2ec98a8e1c3a418cb9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Oct 2019 18:00:56 -0700 Subject: net: ensure correct skb->tstamp in various fragmenters Thomas found that some forwarded packets would be stuck in FQ packet scheduler because their skb->tstamp contained timestamps far in the future. We thought we addressed this point in commit 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths") but there is still an issue when/if a packet needs to be fragmented. In order to meet EDT requirements, we have to make sure all fragments get the original skb->tstamp. Note that this original skb->tstamp should be zero in forwarding path, but might have a non zero value in output path if user decided so. Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Signed-off-by: Eric Dumazet Reported-by: Thomas Bartschies Signed-off-by: David S. Miller --- net/bridge/netfilter/nf_conntrack_bridge.c | 3 +++ net/ipv4/ip_output.c | 3 +++ net/ipv6/ip6_output.c | 3 +++ net/ipv6/netfilter.c | 3 +++ 4 files changed, 12 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 8842798c29e6..506d6141e44e 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -33,6 +33,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; unsigned int hlen, ll_rs, mtu; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; int err; @@ -80,6 +81,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -104,6 +106,7 @@ slow_path: goto blackhole; } + skb2->tstamp = tstamp; err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 28fca408812c..814b9b8882a0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -771,6 +771,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; + ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; @@ -846,6 +847,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ip_fraglist_prepare(skb, &iter); } + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) @@ -900,6 +902,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + skb2->tstamp = tstamp; err = output(net, sk, skb2); if (err) goto fail; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index edadee4a7e76..71827b56c006 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -768,6 +768,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, inet6_sk(skb->sk) : NULL; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; + ktime_t tstamp = skb->tstamp; int hroom, err = 0; __be32 frag_id; u8 *prevhdr, nexthdr = 0; @@ -855,6 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -913,6 +915,7 @@ slow_path: /* * Put this fragment into the sending queue. */ + frag->tstamp = tstamp; err = output(net, sk, frag); if (err) goto fail; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index a9bff556d3b2..409e79b84a83 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -119,6 +119,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; unsigned int mtu, hlen; @@ -183,6 +184,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); + skb->tstamp = tstamp; err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -215,6 +217,7 @@ slow_path: goto blackhole; } + skb2->tstamp = tstamp; err = output(net, sk, data, skb2); if (err) goto blackhole; -- cgit v1.2.3 From ec3359b685db834c249953db6ac5717bf5cde425 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 17 Oct 2019 14:44:02 +0200 Subject: vsock/virtio: send a credit update when buffer size is changed When the user application set a new buffer size value, we should update the remote peer about this change, since it uses this information to calculate the credit available. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a666ef8fc54e..db127a69f5c3 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -458,6 +458,9 @@ void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) vvs->buf_size_max = val; vvs->buf_size = val; vvs->buf_alloc = val; + + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); } EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); -- cgit v1.2.3 From ae6fcfbf5f03de3407b809aaee319330d3dc7f8b Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 17 Oct 2019 14:44:03 +0200 Subject: vsock/virtio: discard packets if credit is not respected If the remote peer doesn't respect the credit information (buf_alloc, fwd_cnt), sending more data than it can send, we should drop the packets to prevent a malicious peer from using all of our memory. This is patch follows the VIRTIO spec: "VIRTIO_VSOCK_OP_RW data packets MUST only be transmitted when the peer has sufficient free buffer space for the payload" Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index db127a69f5c3..481f7f8a1655 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -204,10 +204,14 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, return virtio_transport_get_ops()->send_pkt(pkt); } -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, +static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) { + if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) + return false; + vvs->rx_bytes += pkt->len; + return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, @@ -879,14 +883,18 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, struct virtio_vsock_pkt *pkt) { struct virtio_vsock_sock *vvs = vsk->trans; - bool free_pkt = false; + bool can_enqueue, free_pkt = false; pkt->len = le32_to_cpu(pkt->hdr.len); pkt->off = 0; spin_lock_bh(&vvs->rx_lock); - virtio_transport_inc_rx_pkt(vvs, pkt); + can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); + if (!can_enqueue) { + free_pkt = true; + goto out; + } /* Try to copy small packets into the buffer of last packet queued, * to avoid wasting memory queueing the entire buffer with a small -- cgit v1.2.3 From a7fa12d15855904aff1716e1fc723c03ba38c5cc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Oct 2019 09:16:57 -0700 Subject: net: netem: fix error path for corrupted GSO frames To corrupt a GSO frame we first perform segmentation. We then proceed using the first segment instead of the full GSO skb and requeue the rest of the segments as separate packets. If there are any issues with processing the first segment we still want to process the rest, therefore we jump to the finish_segs label. Commit 177b8007463c ("net: netem: fix backlog accounting for corrupted GSO frames") started using the pointer to the first segment in the "rest of segments processing", but as mentioned above the first segment may had already been freed at this point. Backlog corrections for parent qdiscs have to be adjusted. Fixes: 177b8007463c ("net: netem: fix backlog accounting for corrupted GSO frames") Reported-by: kbuild test robot Reported-by: Dan Carpenter Reported-by: Ben Hutchings Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0e44039e729c..942eb17f413c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -509,6 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) { qdisc_drop(skb, sch, to_free); + skb = NULL; goto finish_segs; } @@ -593,9 +594,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, finish_segs: if (segs) { unsigned int len, last_len; - int nb = 0; + int nb; - len = skb->len; + len = skb ? skb->len : 0; + nb = skb ? 1 : 0; while (segs) { skb2 = segs->next; @@ -612,7 +614,8 @@ finish_segs: } segs = skb2; } - qdisc_tree_reduce_backlog(sch, -nb, prev_len - len); + /* Parent qdiscs accounted for 1 skb of size @prev_len */ + qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); } return NET_XMIT_SUCCESS; } -- cgit v1.2.3 From e0ad032e144731a5928f2d75e91c2064ba1a764c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Oct 2019 09:16:58 -0700 Subject: net: netem: correct the parent's backlog when corrupted packet was dropped If packet corruption failed we jump to finish_segs and return NET_XMIT_SUCCESS. Seeing success will make the parent qdisc increment its backlog, that's incorrect - we need to return NET_XMIT_DROP. Fixes: 6071bd1aa13e ("netem: Segment GSO packets on enqueue") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 942eb17f413c..42e557d48e4e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -616,6 +616,8 @@ finish_segs: } /* Parent qdiscs accounted for 1 skb of size @prev_len */ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); + } else if (!skb) { + return NET_XMIT_DROP; } return NET_XMIT_SUCCESS; } -- cgit v1.2.3 From 50c7d2ba9de20f60a2d527ad6928209ef67e4cdd Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 18 Oct 2019 17:02:46 -0400 Subject: net: dsa: fix switch tree list If there are multiple switch trees on the device, only the last one will be listed, because the arguments of list_add_tail are swapped. Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation") Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 73002022c9d8..716d265ba8ca 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -46,7 +46,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index) dst->index = index; INIT_LIST_HEAD(&dst->list); - list_add_tail(&dsa_tree_list, &dst->list); + list_add_tail(&dst->list, &dsa_tree_list); kref_init(&dst->refcount); -- cgit v1.2.3