diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-23 16:50:42 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-23 16:50:42 -0700 |
| commit | e728258debd553c95d2e70f9cd97c9fde27c7130 (patch) | |
| tree | 18ef97c80f9923717f5cf6bdab44d77607ca0f4b | /net |
| parent | e8df5a0c0d041588e7f02781822d637d226cdbe8 (diff) | |
| parent | 5e6391da4539c35422c0df1d1d2d9a9bb97cd736 (diff) | |
Merge tag 'net-7.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Jakub Kicinski:
"Including fixes from Netfilter.
Steady stream of fixes. The last two weeks feel comparable to the two
weeks before the merge window. Lots of AI-aided bug discovery. A newer
big source is Sashiko/Gemini (Roman Gushchin's system), which points
out issues in existing code during patch review (maybe 25% of the fixes
here likely originate from Sashiko). The nice thing is that these are
often fixed by the respective maintainers, not via drive-by patches.
Current release - new code bugs:
- kconfig: MDIO_PIC64HPSC should depend on ARCH_MICROCHIP
Previous releases - regressions:
- add an async ndo_set_rx_mode and switch over to it the drivers whose
  callbacks we promised would be called under the per-netdev mutex (see
  the driver-side sketch below)
- dsa: remove duplicate netdev_lock_ops() for conduit ethtool ops
- hv_sock: report EOF instead of -EIO for FIN
- vsock/virtio: fix MSG_PEEK calculation on bytes to copy
Previous releases - always broken:
- ipv6: fix possible UAF in icmpv6_rcv()
- icmp: validate reply type before using icmp_pointers
- af_unix: drop all SCM attributes for SOCKMAP
- netfilter: fix a number of bugs in the osf (OS fingerprinting) code
- eth: intel: fix timestamp interrupt configuration for E825C
Misc:
- bunch of data-race annotations"
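The async rx_mode hook referenced above can be pictured with a minimal driver-side sketch. This is illustrative only, assuming the ndo_set_rx_mode_async() signature used by netif_rx_mode_run() in the diff below; the foo_* driver and its firmware helpers are hypothetical, not part of this series:

```c
/* Hedged sketch of a driver adopting the new hook. foo_fw_add_filter()
 * and foo_fw_del_filter() stand in for per-address firmware commands
 * that may sleep.
 */
static int foo_sync_addr(struct net_device *dev, const unsigned char *addr)
{
	return foo_fw_add_filter(netdev_priv(dev), addr);	/* may sleep */
}

static int foo_unsync_addr(struct net_device *dev, const unsigned char *addr)
{
	return foo_fw_del_filter(netdev_priv(dev), addr);	/* may sleep */
}

/* Invoked from the rx_mode workqueue with the instance (ops) lock held,
 * not under netif_addr_lock_bh(). @uc and @mc are snapshots of the real
 * lists; the core reconciles sync_cnt deltas after this returns.
 */
static void foo_set_rx_mode_async(struct net_device *dev,
				  struct netdev_hw_addr_list *uc,
				  struct netdev_hw_addr_list *mc)
{
	__hw_addr_sync_dev(uc, dev, foo_sync_addr, foo_unsync_addr);
	__hw_addr_sync_dev(mc, dev, foo_sync_addr, foo_unsync_addr);
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_set_rx_mode_async	= foo_set_rx_mode_async,
	/* ... */
};
```

Unlike ndo_set_rx_mode(), which runs under netif_addr_lock_bh() and must not sleep, the async variant runs from process context, so blocking firmware calls are fine here.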
* tag 'net-7.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (148 commits)
rxrpc: Fix error handling in rxgk_extract_token()
rxrpc: Fix re-decryption of RESPONSE packets
rxrpc: Fix rxrpc_input_call_event() to only unshare DATA packets
rxrpc: Fix missing validation of ticket length in non-XDR key preparsing
rxgk: Fix potential integer overflow in length check
rxrpc: Fix conn-level packet handling to unshare RESPONSE packets
rxrpc: Fix potential UAF after skb_unshare() failure
rxrpc: Fix rxkad crypto unalignment handling
rxrpc: Fix memory leaks in rxkad_verify_response()
net: rds: fix MR cleanup on copy error
m68k: mvme147: Make me the maintainer
net: txgbe: fix firmware version check
selftests/bpf: check epoll readiness during reuseport migration
tcp: call sk_data_ready() after listener migration
vhost_net: fix sleeping with preempt-disabled in vhost_net_busy_poll()
ipv6: Cap TLV scan in ip6_tnl_parse_tlv_enc_lim
tipc: fix double-free in tipc_buf_append()
llc: Return -EINPROGRESS from llc_ui_connect()
ipv4: icmp: validate reply type before using icmp_pointers
selftests/net: packetdrill: cover RFC 5961 5.2 challenge ACK on both edges
...
Diffstat (limited to 'net')
80 files changed, 1505 insertions, 527 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index c40f7d5c4fca..7aa3af8b10ea 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -172,39 +172,42 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
 			       u32 skb_prio, u16 vlan_prio)
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
-	struct vlan_priority_tci_mapping *mp = NULL;
+	struct vlan_priority_tci_mapping __rcu **mpp;
+	struct vlan_priority_tci_mapping *mp;
 	struct vlan_priority_tci_mapping *np;
+	u32 bucket = skb_prio & 0xF;
 	u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
 
 	/* See if a priority mapping exists.. */
-	mp = vlan->egress_priority_map[skb_prio & 0xF];
+	mpp = &vlan->egress_priority_map[bucket];
+	mp = rtnl_dereference(*mpp);
 	while (mp) {
 		if (mp->priority == skb_prio) {
-			if (mp->vlan_qos && !vlan_qos)
+			if (!vlan_qos) {
+				rcu_assign_pointer(*mpp, rtnl_dereference(mp->next));
 				vlan->nr_egress_mappings--;
-			else if (!mp->vlan_qos && vlan_qos)
-				vlan->nr_egress_mappings++;
-			mp->vlan_qos = vlan_qos;
+				kfree_rcu(mp, rcu);
+			} else {
+				WRITE_ONCE(mp->vlan_qos, vlan_qos);
+			}
 			return 0;
 		}
-		mp = mp->next;
+		mpp = &mp->next;
+		mp = rtnl_dereference(*mpp);
 	}
 
 	/* Create a new mapping then. */
-	mp = vlan->egress_priority_map[skb_prio & 0xF];
+	if (!vlan_qos)
+		return 0;
+
 	np = kmalloc_obj(struct vlan_priority_tci_mapping);
 	if (!np)
 		return -ENOBUFS;
 
-	np->next = mp;
 	np->priority = skb_prio;
 	np->vlan_qos = vlan_qos;
-	/* Before inserting this element in hash table, make sure all its fields
-	 * are committed to memory.
-	 * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask()
-	 */
-	smp_wmb();
-	vlan->egress_priority_map[skb_prio & 0xF] = np;
+	RCU_INIT_POINTER(np->next, rtnl_dereference(vlan->egress_priority_map[bucket]));
+	rcu_assign_pointer(vlan->egress_priority_map[bucket], np);
 	if (vlan_qos)
 		vlan->nr_egress_mappings++;
 	return 0;
@@ -604,11 +607,17 @@ void vlan_dev_free_egress_priority(const struct net_device *dev)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
-		while ((pm = vlan->egress_priority_map[i]) != NULL) {
-			vlan->egress_priority_map[i] = pm->next;
-			kfree(pm);
+		pm = rtnl_dereference(vlan->egress_priority_map[i]);
+		RCU_INIT_POINTER(vlan->egress_priority_map[i], NULL);
+		while (pm) {
+			struct vlan_priority_tci_mapping *next;
+
+			next = rtnl_dereference(pm->next);
+			kfree_rcu(pm, rcu);
+			pm = next;
 		}
 	}
+	vlan->nr_egress_mappings = 0;
 }
 
 static void vlan_dev_uninit(struct net_device *dev)
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index a000b1ef0520..368d53ca7d87 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -260,13 +260,11 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 
 		for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
-			for (pm = vlan->egress_priority_map[i]; pm;
-			     pm = pm->next) {
-				if (!pm->vlan_qos)
-					continue;
-
+			for (pm = rcu_dereference_rtnl(vlan->egress_priority_map[i]); pm;
+			     pm = rcu_dereference_rtnl(pm->next)) {
+				u16 vlan_qos = READ_ONCE(pm->vlan_qos);
 				m.from = pm->priority;
-				m.to = (pm->vlan_qos >> 13) & 0x7;
+				m.to = (vlan_qos >> 13) & 0x7;
 				if (nla_put(skb, IFLA_VLAN_QOS_MAPPING,
 					    sizeof(m), &m))
 					goto nla_put_failure;
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index fa67374bda49..0e424e0895b7 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -262,15 +262,19 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 		   vlan->ingress_priority_map[7]);
 
 	seq_printf(seq, " EGRESS priority mappings: ");
+	rcu_read_lock();
 	for (i = 0; i < 16; i++) {
-		const struct vlan_priority_tci_mapping *mp
-			= vlan->egress_priority_map[i];
+		const struct vlan_priority_tci_mapping *mp =
+			rcu_dereference(vlan->egress_priority_map[i]);
 		while (mp) {
+			u16 vlan_qos = READ_ONCE(mp->vlan_qos);
+
 			seq_printf(seq, "%u:%d ",
-				   mp->priority, ((mp->vlan_qos >> 13) & 0x7));
-			mp = mp->next;
+				   mp->priority, ((vlan_qos >> 13) & 0x7));
+			mp = rcu_dereference(mp->next);
 		}
 	}
+	rcu_read_unlock();
 	seq_puts(seq, "\n");
 
 	return 0;
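The vlan hunks above convert the egress priority map from an smp_wmb()/smp_rmb() publication scheme to RCU. The fast-path reader, vlan_dev_get_egress_qos_mask(), is not among the hunks shown; a hedged sketch of what the converted reader would look like, following the same conventions as the vlanproc.c hunk:

```c
/* Sketch only -- the actual reader change is not part of the hunks shown.
 * Runs in the transmit path, which already executes inside an RCU
 * read-side critical section, so no explicit rcu_read_lock() is taken.
 */
static u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skb_prio)
{
	struct vlan_priority_tci_mapping *mp;

	mp = rcu_dereference(vlan_dev_priv(dev)->egress_priority_map[skb_prio & 0xF]);
	while (mp) {
		if (mp->priority == skb_prio)
			return READ_ONCE(mp->vlan_qos);
		mp = rcu_dereference(mp->next);
	}
	return 0;
}
```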
"); + rcu_read_lock(); for (i = 0; i < 16; i++) { - const struct vlan_priority_tci_mapping *mp - = vlan->egress_priority_map[i]; + const struct vlan_priority_tci_mapping *mp = + rcu_dereference(vlan->egress_priority_map[i]); while (mp) { + u16 vlan_qos = READ_ONCE(mp->vlan_qos); + seq_printf(seq, "%u:%d ", - mp->priority, ((mp->vlan_qos >> 13) & 0x7)); - mp = mp->next; + mp->priority, ((vlan_qos >> 13) & 0x7)); + mp = rcu_dereference(mp->next); } } + rcu_read_unlock(); seq_puts(seq, "\n"); return 0; diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 0c8a06cdd46f..deb1ab1f24b0 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -201,11 +201,12 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, f = br_fdb_find_rcu(br, n->ha, vid); if (f) { + const struct net_bridge_port *dst = READ_ONCE(f->dst); bool replied = false; if ((p && (p->flags & BR_PROXYARP)) || - (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)) || - br_is_neigh_suppress_enabled(f->dst, vid)) { + (dst && (dst->flags & BR_PROXYARP_WIFI)) || + br_is_neigh_suppress_enabled(dst, vid)) { if (!vid) br_arp_send(br, p, skb->dev, sip, tip, sha, n->ha, sha, 0, 0); @@ -469,9 +470,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, f = br_fdb_find_rcu(br, n->ha, vid); if (f) { + const struct net_bridge_port *dst = READ_ONCE(f->dst); bool replied = false; - if (br_is_neigh_suppress_enabled(f->dst, vid)) { + if (br_is_neigh_suppress_enabled(dst, vid)) { if (vid != 0) br_nd_send(br, p, skb, n, skb->vlan_proto, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e2c17f620f00..6eb3ab69a514 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -236,6 +236,7 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, const unsigned char *addr, __u16 vid) { + const struct net_bridge_port *dst; struct net_bridge_fdb_entry *f; struct net_device *dev = NULL; struct net_bridge *br; @@ -248,8 +249,11 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, br = netdev_priv(br_dev); rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); - if (f && f->dst) - dev = f->dst->dev; + if (f) { + dst = READ_ONCE(f->dst); + if (dst) + dev = dst->dev; + } rcu_read_unlock(); return dev; @@ -346,7 +350,7 @@ static void fdb_delete_local(struct net_bridge *br, vg = nbp_vlan_group(op); if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { - f->dst = op; + WRITE_ONCE(f->dst, op); clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } @@ -357,7 +361,7 @@ static void fdb_delete_local(struct net_bridge *br, /* Maybe bridge device has same hw addr? 
*/ if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { - f->dst = NULL; + WRITE_ONCE(f->dst, NULL); clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } @@ -928,6 +932,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long maxnum, unsigned long skip) { + const struct net_bridge_port *dst; struct net_bridge_fdb_entry *f; struct __fdb_entry *fe = buf; unsigned long delta; @@ -944,7 +949,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, continue; /* ignore pseudo entry for local MAC address */ - if (!f->dst) + dst = READ_ONCE(f->dst); + if (!dst) continue; if (skip) { @@ -956,8 +962,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN); /* due to ABI compat need to split into hi/lo */ - fe->port_no = f->dst->port_no; - fe->port_hi = f->dst->port_no >> 8; + fe->port_no = dst->port_no; + fe->port_hi = dst->port_no >> 8; fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); if (!test_bit(BR_FDB_STATIC, &f->flags)) { @@ -1083,9 +1089,11 @@ int br_fdb_dump(struct sk_buff *skb, rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { + const struct net_bridge_port *dst = READ_ONCE(f->dst); + if (*idx < ctx->fdb_idx) goto skip; - if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { + if (filter_dev && (!dst || dst->dev != filter_dev)) { if (filter_dev != dev) goto skip; /* !f->dst is a special case for bridge @@ -1093,10 +1101,10 @@ int br_fdb_dump(struct sk_buff *skb, * Therefore need a little more filtering * we only want to dump the !f->dst case */ - if (f->dst) + if (dst) goto skip; } - if (!filter_dev && f->dst) + if (!filter_dev && dst) goto skip; err = fdb_fill_info(skb, br, f, diff --git a/net/core/dev.c b/net/core/dev.c index e59f6025067c..d426c1beeb76 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9593,14 +9593,14 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) +int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); promiscuity = dev->promiscuity + inc; if (promiscuity == 0) { @@ -9636,16 +9636,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } - if (notify) { - /* The ops lock is only required to ensure consistent locking - * for `NETDEV_CHANGE` notifiers. This function is sometimes - * called without the lock, even for devices that are ops - * locked, such as in `dev_uc_sync_multiple` when using - * bonding or teaming. - */ - netdev_ops_assert_locked(dev); + if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); - } return 0; } @@ -9667,7 +9659,7 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify) unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); allmulti = dev->allmulti + inc; if (allmulti == 0) { @@ -9697,46 +9689,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } -/* - * Upload unicast and multicast address lists to device and - * configure RX filtering. 
diff --git a/net/core/dev.h b/net/core/dev.h
index 628bdaebf0ca..0cf24b8f5008 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -78,6 +78,7 @@ void linkwatch_run_queue(void);
 void dev_addr_flush(struct net_device *dev);
 int dev_addr_init(struct net_device *dev);
 void dev_addr_check(struct net_device *dev);
+void __hw_addr_flush(struct netdev_hw_addr_list *list);
 
 #if IS_ENABLED(CONFIG_NET_SHAPER)
 void net_shaper_flush_netdev(struct net_device *dev);
@@ -164,6 +165,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier);
 int dev_change_carrier(struct net_device *dev, bool new_carrier);
 
 void __dev_set_rx_mode(struct net_device *dev);
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
+bool netif_rx_mode_clean(struct net_device *dev);
+void netif_rx_mode_sync(struct net_device *dev);
 
 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 			unsigned int gchanges, u32 portid,
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 76c91f224886..d73fcb0c6785 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -11,9 +11,18 @@
 #include <linux/rtnetlink.h>
 #include <linux/export.h>
 #include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <kunit/visibility.h>
 
 #include "dev.h"
 
+static void netdev_rx_mode_work(struct work_struct *work);
+
+static LIST_HEAD(rx_mode_list);
+static DEFINE_SPINLOCK(rx_mode_lock);
+static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
+
 /*
  * General list handling functions
  */
@@ -481,7 +490,7 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
 }
 EXPORT_SYMBOL(__hw_addr_unsync_dev);
 
-static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+void __hw_addr_flush(struct netdev_hw_addr_list *list)
 {
 	struct netdev_hw_addr *ha, *tmp;
 
@@ -492,6 +501,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
 	}
 	list->count = 0;
 }
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_flush);
 
 void __hw_addr_init(struct netdev_hw_addr_list *list)
 {
@@ -501,6 +511,133 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
 }
 EXPORT_SYMBOL(__hw_addr_init);
 
+static void __hw_addr_splice(struct netdev_hw_addr_list *dst,
+			     struct netdev_hw_addr_list *src)
+{
+	src->tree = RB_ROOT;
+	list_splice_init(&src->list, &dst->list);
+	dst->count += src->count;
+	src->count = 0;
+}
+
+/**
+ * __hw_addr_list_snapshot - create a snapshot copy of an address list
+ * @snap: destination snapshot list (needs to be __hw_addr_init-initialized)
+ * @list: source address list to snapshot
+ * @addr_len: length of addresses
+ * @cache: entry cache to reuse entries from; falls back to GFP_ATOMIC
+ *
+ * Creates a copy of @list, reusing entries from @cache when available.
+ * Must be called under a spinlock.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap,
+			    const struct netdev_hw_addr_list *list,
+			    int addr_len, struct netdev_hw_addr_list *cache)
+{
+	struct netdev_hw_addr *ha, *entry;
+
+	list_for_each_entry(ha, &list->list, list) {
+		if (cache->count) {
+			entry = list_first_entry(&cache->list,
+						 struct netdev_hw_addr, list);
+			list_del(&entry->list);
+			cache->count--;
+			memcpy(entry->addr, ha->addr, addr_len);
+			entry->type = ha->type;
+			entry->global_use = false;
+			entry->synced = 0;
+		} else {
+			entry = __hw_addr_create(ha->addr, addr_len, ha->type,
+						 false, false);
+			if (!entry) {
+				__hw_addr_flush(snap);
+				return -ENOMEM;
+			}
+		}
+		entry->sync_cnt = ha->sync_cnt;
+		entry->refcount = ha->refcount;
+
+		list_add_tail(&entry->list, &snap->list);
+		__hw_addr_insert(snap, entry, addr_len);
+		snap->count++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_list_snapshot);
+
+/**
+ * __hw_addr_list_reconcile - sync snapshot changes back and free snapshots
+ * @real_list: the real address list to update
+ * @work: the working snapshot (modified by driver via __hw_addr_sync_dev)
+ * @ref: the reference snapshot (untouched copy of original state)
+ * @addr_len: length of addresses
+ * @cache: entry cache to return snapshot entries to for reuse
+ *
+ * Walks the reference snapshot and compares each entry against the work
+ * snapshot to compute sync_cnt deltas. Applies those deltas to @real_list.
+ * Returns snapshot entries to @cache for reuse; frees both snapshots.
+ * Caller must hold netif_addr_lock_bh.
+ */
+void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list,
+			      struct netdev_hw_addr_list *work,
+			      struct netdev_hw_addr_list *ref, int addr_len,
+			      struct netdev_hw_addr_list *cache)
+{
+	struct netdev_hw_addr *ref_ha, *tmp, *work_ha, *real_ha;
+	int delta;
+
+	list_for_each_entry_safe(ref_ha, tmp, &ref->list, list) {
+		work_ha = __hw_addr_lookup(work, ref_ha->addr, addr_len,
+					   ref_ha->type);
+		if (work_ha)
+			delta = work_ha->sync_cnt - ref_ha->sync_cnt;
+		else
+			delta = -1;
+
+		if (delta == 0)
+			continue;
+
+		real_ha = __hw_addr_lookup(real_list, ref_ha->addr, addr_len,
+					   ref_ha->type);
+		if (!real_ha) {
+			/* The real entry was concurrently removed. If the
+			 * driver synced this addr to hardware (delta > 0),
+			 * re-insert it as a stale entry so the next work
+			 * run unsyncs it from hardware.
+			 */
+			if (delta > 0) {
+				rb_erase(&ref_ha->node, &ref->tree);
+				list_del(&ref_ha->list);
+				ref->count--;
+				ref_ha->sync_cnt = delta;
+				ref_ha->refcount = delta;
+				list_add_tail_rcu(&ref_ha->list,
+						  &real_list->list);
+				__hw_addr_insert(real_list, ref_ha,
+						 addr_len);
+				real_list->count++;
+			}
+			continue;
+		}
+
+		real_ha->sync_cnt += delta;
+		real_ha->refcount += delta;
+		if (!real_ha->refcount) {
+			rb_erase(&real_ha->node, &real_list->tree);
+			list_del_rcu(&real_ha->list);
+			kfree_rcu(real_ha, rcu_head);
+			real_list->count--;
+		}
+	}
+
+	__hw_addr_splice(cache, work);
+	__hw_addr_splice(cache, ref);
+}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_list_reconcile);
+
 /*
  * Device addresses handling functions
  */
@@ -1049,3 +1186,249 @@ void dev_mc_init(struct net_device *dev)
 	__hw_addr_init(&dev->mc);
 }
 EXPORT_SYMBOL(dev_mc_init);
+
+static int netif_addr_lists_snapshot(struct net_device *dev,
+				     struct netdev_hw_addr_list *uc_snap,
+				     struct netdev_hw_addr_list *mc_snap,
+				     struct netdev_hw_addr_list *uc_ref,
+				     struct netdev_hw_addr_list *mc_ref)
+{
+	int err;
+
+	err = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len,
+				      &dev->rx_mode_addr_cache);
+	if (!err)
+		err = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len,
+					      &dev->rx_mode_addr_cache);
+	if (!err)
+		err = __hw_addr_list_snapshot(mc_snap, &dev->mc,
+					      dev->addr_len,
+					      &dev->rx_mode_addr_cache);
+	if (!err)
+		err = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len,
+					      &dev->rx_mode_addr_cache);
+
+	if (err) {
+		__hw_addr_flush(uc_snap);
+		__hw_addr_flush(uc_ref);
+		__hw_addr_flush(mc_snap);
+	}
+
+	return err;
+}
+
+static void netif_addr_lists_reconcile(struct net_device *dev,
+				       struct netdev_hw_addr_list *uc_snap,
+				       struct netdev_hw_addr_list *mc_snap,
+				       struct netdev_hw_addr_list *uc_ref,
+				       struct netdev_hw_addr_list *mc_ref)
+{
+	__hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len,
+				 &dev->rx_mode_addr_cache);
+	__hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len,
+				 &dev->rx_mode_addr_cache);
+}
+
+/**
+ * netif_uc_promisc_update() - evaluate whether uc_promisc should be toggled.
+ * @dev: device
+ *
+ * Must be called under netif_addr_lock_bh.
+ * Return: +1 to enter promisc, -1 to leave, 0 for no change.
+ */
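Before the tests that exercise it, a worked example of the reconcile arithmetic above may help; this is an editorial sketch written as a C comment, mirroring the "remove during sync" scenario of the KUnit tests below, with state written as (sync_cnt, refcount):

```c
/* Worked example of __hw_addr_list_reconcile() for one address A.
 *
 *   real: A(0,1)                 user added A; not yet programmed into HW
 *   snap: A(0,1)  ref: A(0,1)    snapshots taken under netif_addr_lock_bh()
 *
 *   __hw_addr_sync_dev(&snap, ...)   driver programs A into HW
 *   snap: A(1,2)                 the sync bumps both counters in the snapshot
 *
 *   dev_uc_del(dev, A)           concurrent delete; A vanishes from real
 *
 *   reconcile: delta = snap.sync_cnt - ref.sync_cnt = +1, but A no longer
 *   has a real entry, so the ref entry is re-inserted as stale A(1,1);
 *   the next work run unsyncs A from HW and drops the entry.
 */
```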
+static int netif_uc_promisc_update(struct net_device *dev)
+{
+	if (dev->priv_flags & IFF_UNICAST_FLT)
+		return 0;
+
+	if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+		dev->uc_promisc = true;
+		return 1;
+	}
+	if (netdev_uc_empty(dev) && dev->uc_promisc) {
+		dev->uc_promisc = false;
+		return -1;
+	}
+	return 0;
+}
+
+static void netif_rx_mode_run(struct net_device *dev)
+{
+	struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int promisc_inc;
+	int err;
+
+	might_sleep();
+	netdev_ops_assert_locked(dev);
+
+	__hw_addr_init(&uc_snap);
+	__hw_addr_init(&mc_snap);
+	__hw_addr_init(&uc_ref);
+	__hw_addr_init(&mc_ref);
+
+	if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
+		return;
+
+	if (ops->ndo_set_rx_mode_async) {
+		netif_addr_lock_bh(dev);
+		err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap,
+						&uc_ref, &mc_ref);
+		if (err) {
+			netdev_WARN(dev, "failed to sync uc/mc addresses\n");
+			netif_addr_unlock_bh(dev);
+			return;
+		}
+
+		promisc_inc = netif_uc_promisc_update(dev);
+		netif_addr_unlock_bh(dev);
+	} else {
+		netif_addr_lock_bh(dev);
+		promisc_inc = netif_uc_promisc_update(dev);
+		netif_addr_unlock_bh(dev);
+	}
+
+	if (promisc_inc)
+		__dev_set_promiscuity(dev, promisc_inc, false);
+
+	if (ops->ndo_set_rx_mode_async) {
+		ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
+
+		netif_addr_lock_bh(dev);
+		netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap,
+					   &uc_ref, &mc_ref);
+		netif_addr_unlock_bh(dev);
+	} else if (ops->ndo_set_rx_mode) {
+		netif_addr_lock_bh(dev);
+		ops->ndo_set_rx_mode(dev);
+		netif_addr_unlock_bh(dev);
+	}
+}
+
+static void netdev_rx_mode_work(struct work_struct *work)
+{
+	struct net_device *dev;
+
+	rtnl_lock();
+
+	while (true) {
+		spin_lock_bh(&rx_mode_lock);
+		if (list_empty(&rx_mode_list)) {
+			spin_unlock_bh(&rx_mode_lock);
+			break;
+		}
+		dev = list_first_entry(&rx_mode_list, struct net_device,
+				       rx_mode_node);
+		list_del_init(&dev->rx_mode_node);
+		/* We must free the netdev tracker under
+		 * the spinlock protection.
+		 */
+		netdev_tracker_free(dev, &dev->rx_mode_tracker);
+		spin_unlock_bh(&rx_mode_lock);
+
+		netdev_lock_ops(dev);
+		netif_rx_mode_run(dev);
+		netdev_unlock_ops(dev);
+		/* Use __dev_put() because netdev_tracker_free() was already
+		 * called above. Must be after netdev_unlock_ops() to prevent
+		 * netdev_run_todo() from freeing the device while still in use.
+		 */
+		__dev_put(dev);
+	}
+
+	rtnl_unlock();
+}
+
+static void netif_rx_mode_queue(struct net_device *dev)
+{
+	spin_lock_bh(&rx_mode_lock);
+	if (list_empty(&dev->rx_mode_node)) {
+		list_add_tail(&dev->rx_mode_node, &rx_mode_list);
+		netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC);
+	}
+	spin_unlock_bh(&rx_mode_lock);
+	schedule_work(&rx_mode_work);
+}
+
+/**
+ * __dev_set_rx_mode() - upload unicast and multicast address lists to device
+ *	and configure RX filtering.
+ * @dev: device
+ *
+ * When the device doesn't support unicast filtering it is put in promiscuous
+ * mode while unicast addresses are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int promisc_inc;
+
+	/* dev_open will call this function so the list will stay sane. */
+	if (!(dev->flags & IFF_UP))
+		return;
+
+	if (!netif_device_present(dev))
+		return;
+
+	if (ops->ndo_set_rx_mode_async || ops->ndo_change_rx_flags ||
+	    netdev_need_ops_lock(dev)) {
+		netif_rx_mode_queue(dev);
+		return;
+	}
+
+	/* Legacy path for non-ops-locked HW devices. */
+
+	promisc_inc = netif_uc_promisc_update(dev);
+	if (promisc_inc)
+		__dev_set_promiscuity(dev, promisc_inc, false);
+
+	if (ops->ndo_set_rx_mode)
+		ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+	netif_addr_lock_bh(dev);
+	__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+}
+
+bool netif_rx_mode_clean(struct net_device *dev)
+{
+	bool clean = false;
+
+	spin_lock_bh(&rx_mode_lock);
+	if (!list_empty(&dev->rx_mode_node)) {
+		list_del_init(&dev->rx_mode_node);
+		clean = true;
+		/* We must release the netdev tracker under
+		 * the spinlock protection.
+		 */
+		netdev_tracker_free(dev, &dev->rx_mode_tracker);
+	}
+	spin_unlock_bh(&rx_mode_lock);
+
+	return clean;
+}
+
+/**
+ * netif_rx_mode_sync() - sync rx mode inline
+ * @dev: network device
+ *
+ * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback
+ * executed from a workqueue. This allows the callback to sleep, but means
+ * the hardware update is deferred and may not be visible to userspace
+ * by the time the initiating syscall returns. netif_rx_mode_sync() steals
+ * the workqueue update and executes it inline, preserving the atomicity
+ * of these operations as seen by userspace.
+ */
+void netif_rx_mode_sync(struct net_device *dev)
+{
+	if (netif_rx_mode_clean(dev)) {
+		netif_rx_mode_run(dev);
+		/* Use __dev_put() because netdev_tracker_free() was already
+		 * called inside netif_rx_mode_clean().
+		 */
+		__dev_put(dev);
+	}
+}
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
index 8e1dba825e94..260e71a2399f 100644
--- a/net/core/dev_addr_lists_test.c
+++ b/net/core/dev_addr_lists_test.c
@@ -2,22 +2,31 @@
 
 #include <kunit/test.h>
 #include <linux/etherdevice.h>
+#include <linux/math64.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 
 static const struct net_device_ops dummy_netdev_ops = {
 };
 
+#define ADDR_A 1
+#define ADDR_B 2
+#define ADDR_C 3
+
 struct dev_addr_test_priv {
 	u32 addr_seen;
+	u32 addr_synced;
+	u32 addr_unsynced;
 };
 
 static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
 {
 	struct dev_addr_test_priv *datp = netdev_priv(netdev);
 
-	if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+	if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) {
 		datp->addr_seen |= 1 << a[0];
+		datp->addr_synced |= 1 << a[0];
+	}
 	return 0;
 }
 
@@ -26,11 +35,22 @@ static int dev_addr_test_unsync(struct net_device *netdev,
 {
 	struct dev_addr_test_priv *datp = netdev_priv(netdev);
 
-	if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+	if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) {
 		datp->addr_seen &= ~(1 << a[0]);
+		datp->addr_unsynced |= 1 << a[0];
+	}
 	return 0;
 }
 
+static void dev_addr_test_reset(struct net_device *netdev)
+{
+	struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+	datp->addr_seen = 0;
+	datp->addr_synced = 0;
+	datp->addr_unsynced = 0;
+}
+
 static int dev_addr_test_init(struct kunit *test)
 {
 	struct dev_addr_test_priv *datp;
@@ -225,6 +245,363 @@ static void dev_addr_test_add_excl(struct kunit *test)
 	rtnl_unlock();
 }
 
+/* Snapshot test: basic sync with no concurrent modifications.
+ * Add one address, snapshot, driver syncs it, reconcile propagates
+ * sync_cnt delta back to real list.
+ */
+static void dev_addr_test_snapshot_sync(struct kunit *test)
+{
+	struct netdev_hw_addr_list snap, ref, cache;
+	struct net_device *netdev = test->priv;
+	struct dev_addr_test_priv *datp;
+	struct netdev_hw_addr *ha;
+	u8 addr[ETH_ALEN];
+
+	datp = netdev_priv(netdev);
+
+	rtnl_lock();
+
+	memset(addr, ADDR_A, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+	/* Snapshot: ADDR_A has sync_cnt=0, refcount=1 (new) */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_init(&snap);
+	__hw_addr_init(&ref);
+	__hw_addr_init(&cache);
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+						&cache));
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+						&cache));
+	netif_addr_unlock_bh(netdev);
+
+	/* Driver syncs ADDR_A to hardware */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+	/* Reconcile: delta=+1 applied to real entry */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+				 &cache);
+	netif_addr_unlock_bh(netdev);
+
+	/* Real entry should now reflect the sync: sync_cnt=1, refcount=2 */
+	KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+	ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+	KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+	KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+	KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+
+	/* Second work run: already synced, nothing to do */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+	KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+
+	__hw_addr_flush(&cache);
+	rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A synced to hardware, then concurrently removed
+ * from the real list before reconcile runs. Reconcile re-inserts ADDR_A as
+ * a stale entry so the next work run unsyncs it from hardware.
+ */
+static void dev_addr_test_snapshot_remove_during_sync(struct kunit *test)
+{
+	struct netdev_hw_addr_list snap, ref, cache;
+	struct net_device *netdev = test->priv;
+	struct dev_addr_test_priv *datp;
+	struct netdev_hw_addr *ha;
+	u8 addr[ETH_ALEN];
+
+	datp = netdev_priv(netdev);
+
+	rtnl_lock();
+
+	memset(addr, ADDR_A, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+	/* Snapshot: ADDR_A is new (sync_cnt=0, refcount=1) */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_init(&snap);
+	__hw_addr_init(&ref);
+	__hw_addr_init(&cache);
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+						&cache));
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+						&cache));
+	netif_addr_unlock_bh(netdev);
+
+	/* Driver syncs ADDR_A to hardware */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+	/* Concurrent removal: user deletes ADDR_A while driver was working */
+	memset(addr, ADDR_A, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+	KUNIT_EXPECT_EQ(test, 0, netdev->uc.count);
+
+	/* Reconcile: ADDR_A gone from real list but driver synced it,
+	 * so it gets re-inserted as stale (sync_cnt=1, refcount=1).
+	 */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+				 &cache);
+	netif_addr_unlock_bh(netdev);
+
+	KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+	ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+	KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+	KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+	KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+
+	/* Second work run: stale entry gets unsynced from HW and removed */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_unsynced);
+	KUNIT_EXPECT_EQ(test, 0, netdev->uc.count);
+
+	__hw_addr_flush(&cache);
+	rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A was stale (unsynced from hardware by driver),
+ * but concurrently re-added by the user. The re-add bumps refcount of
+ * the existing stale entry. Reconcile applies delta=-1, leaving ADDR_A
+ * as a fresh entry (sync_cnt=0, refcount=1) for the next work run.
+ */
+static void dev_addr_test_snapshot_readd_during_unsync(struct kunit *test)
+{
+	struct netdev_hw_addr_list snap, ref, cache;
+	struct net_device *netdev = test->priv;
+	struct dev_addr_test_priv *datp;
+	struct netdev_hw_addr *ha;
+	u8 addr[ETH_ALEN];
+
+	datp = netdev_priv(netdev);
+
+	rtnl_lock();
+
+	memset(addr, ADDR_A, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+	/* Sync ADDR_A to hardware: sync_cnt=1, refcount=2 */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+	/* User removes ADDR_A: refcount=1, sync_cnt=1 -> stale */
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+
+	/* Snapshot: ADDR_A is stale (sync_cnt=1, refcount=1) */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_init(&snap);
+	__hw_addr_init(&ref);
+	__hw_addr_init(&cache);
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+						&cache));
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+						&cache));
+	netif_addr_unlock_bh(netdev);
+
+	/* Driver unsyncs stale ADDR_A from hardware */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_unsynced);
+
+	/* Concurrent: user re-adds ADDR_A. dev_uc_add finds the existing
+	 * stale entry and bumps refcount from 1 -> 2. sync_cnt stays 1.
+	 */
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+	KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+
+	/* Reconcile: ref sync_cnt=1 matches real sync_cnt=1, delta=-1
+	 * applied. Result: sync_cnt=0, refcount=1 (fresh).
+	 */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+				 &cache);
+	netif_addr_unlock_bh(netdev);
+
+	/* Entry survives as fresh: needs re-sync to HW */
+	KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+	ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+	KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+	KUNIT_EXPECT_EQ(test, 0, ha->sync_cnt);
+	KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+
+	/* Second work run: fresh entry gets synced to HW */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+	__hw_addr_flush(&cache);
+	rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A is new (synced by driver), and independent ADDR_B
+ * is concurrently removed from the real list. A's sync delta propagates
+ * normally; B's absence doesn't interfere.
+ */
+static void dev_addr_test_snapshot_add_and_remove(struct kunit *test)
+{
+	struct netdev_hw_addr_list snap, ref, cache;
+	struct net_device *netdev = test->priv;
+	struct dev_addr_test_priv *datp;
+	struct netdev_hw_addr *ha;
+	u8 addr[ETH_ALEN];
+
+	datp = netdev_priv(netdev);
+
+	rtnl_lock();
+
+	/* Add ADDR_A and ADDR_B (will be synced then removed) */
+	memset(addr, ADDR_A, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+	memset(addr, ADDR_B, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+	/* Sync both to hardware: sync_cnt=1, refcount=2 */
+	__hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+
+	/* Add ADDR_C (new, will be synced by snapshot) */
+	memset(addr, ADDR_C, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+	/* Snapshot: A,B synced (sync_cnt=1,refcount=2); C new (0,1) */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_init(&snap);
+	__hw_addr_init(&ref);
+	__hw_addr_init(&cache);
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+						&cache));
+	KUNIT_EXPECT_EQ(test, 0,
+			__hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+						&cache));
+	netif_addr_unlock_bh(netdev);
+
+	/* Driver syncs snapshot: ADDR_C is new -> synced; A,B already synced */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_C, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+	/* Concurrent: user removes addr B while driver was working */
+	memset(addr, ADDR_B, sizeof(addr));
+	KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+
+	/* Reconcile: ADDR_C's delta=+1 applied to real list.
+	 * ADDR_B's delta=0 (unchanged in snapshot),
+	 * so nothing to apply to ADDR_B.
+	 */
+	netif_addr_lock_bh(netdev);
+	__hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+				 &cache);
+	netif_addr_unlock_bh(netdev);
+
+	/* ADDR_A: unchanged (sync_cnt=1, refcount=2)
+	 * ADDR_B: refcount went from 2->1 via dev_uc_del (still present, stale)
+	 * ADDR_C: sync propagated (sync_cnt=1, refcount=2)
+	 */
+	KUNIT_EXPECT_EQ(test, 3, netdev->uc.count);
+	netdev_hw_addr_list_for_each(ha, &netdev->uc) {
+		u8 id = ha->addr[0];
+
+		if (!memchr_inv(ha->addr, id, ETH_ALEN)) {
+			if (id == ADDR_A) {
+				KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+				KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+			} else if (id == ADDR_B) {
+				/* B: still present but now stale */
+				KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+				KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+			} else if (id == ADDR_C) {
+				KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+				KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+			}
+		}
+	}
+
+	/* Second work run: ADDR_B is stale, gets unsynced and removed */
+	dev_addr_test_reset(netdev);
+	__hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+			   dev_addr_test_unsync);
+	KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+	KUNIT_EXPECT_EQ(test, 1 << ADDR_B, datp->addr_unsynced);
+	KUNIT_EXPECT_EQ(test, 2, netdev->uc.count);
+
+	__hw_addr_flush(&cache);
+	rtnl_unlock();
+}
+
+static void dev_addr_test_snapshot_benchmark(struct kunit *test)
+{
+	struct net_device *netdev = test->priv;
+	struct netdev_hw_addr_list snap, cache;
+	u8 addr[ETH_ALEN];
+	s64 duration = 0;
+	ktime_t start;
+	int i, iter;
+
+	rtnl_lock();
+
+	for (i = 0; i < 1024; i++) {
+		memset(addr, 0, sizeof(addr));
+		addr[0] = (i >> 8) & 0xff;
+		addr[1] = i & 0xff;
+		KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+	}
+
+	__hw_addr_init(&cache);
+
+	for (iter = 0; iter < 1000; iter++) {
+		netif_addr_lock_bh(netdev);
+		__hw_addr_init(&snap);
+
+		start = ktime_get();
+		KUNIT_EXPECT_EQ(test, 0,
+				__hw_addr_list_snapshot(&snap, &netdev->uc,
+							ETH_ALEN, &cache));
+		duration += ktime_to_ns(ktime_sub(ktime_get(), start));
+
+		netif_addr_unlock_bh(netdev);
+		__hw_addr_flush(&snap);
+	}
+
+	__hw_addr_flush(&cache);
+
+	kunit_info(test,
+		   "1024 addrs x 1000 snapshots: %lld ns total, %lld ns/iter",
+		   duration, div_s64(duration, 1000));
+
+	rtnl_unlock();
+}
+
 static struct kunit_case dev_addr_test_cases[] = {
 	KUNIT_CASE(dev_addr_test_basic),
 	KUNIT_CASE(dev_addr_test_sync_one),
@@ -232,6 +609,11 @@ static struct kunit_case dev_addr_test_cases[] = {
 	KUNIT_CASE(dev_addr_test_del_main),
 	KUNIT_CASE(dev_addr_test_add_set),
 	KUNIT_CASE(dev_addr_test_add_excl),
+	KUNIT_CASE(dev_addr_test_snapshot_sync),
+	KUNIT_CASE(dev_addr_test_snapshot_remove_during_sync),
+	KUNIT_CASE(dev_addr_test_snapshot_readd_during_unsync),
+	KUNIT_CASE(dev_addr_test_snapshot_add_and_remove),
+	KUNIT_CASE_SLOW(dev_addr_test_snapshot_benchmark),
 	{}
 };
 
@@ -243,5 +625,6 @@ static struct kunit_suite dev_addr_test_suite = {
 };
 kunit_test_suite(dev_addr_test_suite);
 
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
 MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list");
 MODULE_LICENSE("GPL");
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index f28852078aa6..437947dd08ed 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
 
 	netdev_lock_ops(dev);
 	ret = netif_change_flags(dev, flags, extack);
+	netif_rx_mode_sync(dev);
 	netdev_unlock_ops(dev);
 
 	return ret;
@@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
 
 	netdev_lock_ops(dev);
 	ret = netif_set_promiscuity(dev, inc);
+	netif_rx_mode_sync(dev);
 	netdev_unlock_ops(dev);
 
 	return ret;
@@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)
 
 	netdev_lock_ops(dev);
 	ret = netif_set_allmulti(dev, inc, true);
+	netif_rx_mode_sync(dev);
 	netdev_unlock_ops(dev);
 
 	return ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7a8966544c9d..f3979b276090 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
 		return err;
 
 	case SIOCADDMULTI:
-		if (!ops->ndo_set_rx_mode ||
+		if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 			return -EINVAL;
 		if (!netif_device_present(dev))
 			return -ENODEV;
 		netdev_lock_ops(dev);
 		err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+		netif_rx_mode_sync(dev);
 		netdev_unlock_ops(dev);
 		return err;
 
	case SIOCDELMULTI:
-		if (!ops->ndo_set_rx_mode ||
+		if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 			return -EINVAL;
 		if (!netif_device_present(dev))
 			return -ENODEV;
 		netdev_lock_ops(dev);
 		err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+		netif_rx_mode_sync(dev);
 		netdev_unlock_ops(dev);
 		return err;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5fa9189eb772..80a3b702a2d4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5396,7 +5396,7 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 		if (val <= 0)
 			return -EINVAL;
 		tp->snd_cwnd_clamp = val;
-		tp->snd_ssthresh = val;
+		WRITE_ONCE(tp->snd_ssthresh, val);
 		break;
 	case TCP_BPF_DELACK_MAX:
 		timeout = usecs_to_jiffies(val);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1b61bb25ba0e..2a98f5fa74eb 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1374,16 +1374,13 @@ proto_again:
 			break;
 		}
 
-		/* least significant bit of the most significant octet
-		 * indicates if protocol field was compressed
+		/* PFC (compressed 1-byte protocol) frames are not processed.
+		 * A compressed protocol field has the least significant bit of
+		 * the most significant octet set, which will fail the following
+		 * ppp_proto_is_valid(), returning FLOW_DISSECT_RET_OUT_BAD.
 		 */
 		ppp_proto = ntohs(hdr->proto);
-		if (ppp_proto & 0x0100) {
-			ppp_proto = ppp_proto >> 8;
-			nhoff += PPPOE_SES_HLEN - 1;
-		} else {
-			nhoff += PPPOE_SES_HLEN;
-		}
+		nhoff += PPPOE_SES_HLEN;
 
 		if (ppp_proto == PPP_IP) {
 			proto = htons(ETH_P_IP);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 69daba3ddaf0..b613bb6e07df 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3431,6 +3431,7 @@ errout:
 			     dev->name);
 	}
 
+	netif_rx_mode_sync(dev);
 	netdev_unlock_ops(dev);
 
 	return err;
diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c
index a1b044467bd6..8398d72d7e4d 100644
--- a/net/dsa/conduit.c
+++ b/net/dsa/conduit.c
@@ -27,9 +27,7 @@ static int dsa_conduit_get_regs_len(struct net_device *dev)
 	int len;
 
 	if (ops && ops->get_regs_len) {
-		netdev_lock_ops(dev);
 		len = ops->get_regs_len(dev);
-		netdev_unlock_ops(dev);
 		if (len < 0)
 			return len;
 		ret += len;
@@ -60,15 +58,11 @@ static void dsa_conduit_get_regs(struct net_device *dev,
 	int len;
 
 	if (ops && ops->get_regs_len && ops->get_regs) {
-		netdev_lock_ops(dev);
 		len = ops->get_regs_len(dev);
-		if (len < 0) {
-			netdev_unlock_ops(dev);
+		if (len < 0)
 			return;
-		}
 
 		regs->len = len;
 		ops->get_regs(dev, regs, data);
-		netdev_unlock_ops(dev);
 
 		data += regs->len;
 	}
@@ -115,10 +109,8 @@ static void dsa_conduit_get_ethtool_stats(struct net_device *dev,
 	int count, mcount = 0;
 
 	if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
-		netdev_lock_ops(dev);
 		mcount = ops->get_sset_count(dev, ETH_SS_STATS);
 		ops->get_ethtool_stats(dev, stats, data);
-		netdev_unlock_ops(dev);
 	}
 
 	list_for_each_entry(dp, &dst->ports, list) {
@@ -149,10 +141,8 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev,
 		if (count >= 0)
 			phy_ethtool_get_stats(dev->phydev, stats, data);
 	} else if (ops && ops->get_sset_count && ops->get_ethtool_phy_stats) {
-		netdev_lock_ops(dev);
 		count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
 		ops->get_ethtool_phy_stats(dev, stats, data);
-		netdev_unlock_ops(dev);
 	}
 
 	if (count < 0)
@@ -176,13 +166,11 @@ static int dsa_conduit_get_sset_count(struct net_device *dev, int sset)
 	struct dsa_switch_tree *dst = cpu_dp->dst;
 	int count = 0;
 
-	netdev_lock_ops(dev);
 	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
 	    (!ops || !ops->get_ethtool_phy_stats))
 		count = phy_ethtool_get_sset_count(dev->phydev);
 	else if (ops && ops->get_sset_count)
 		count = ops->get_sset_count(dev, sset);
-	netdev_unlock_ops(dev);
 
 	if (count < 0)
 		count = 0;
@@ -239,7 +227,6 @@ static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset,
 	struct dsa_switch_tree *dst = cpu_dp->dst;
 	int count, mcount = 0;
 
-	netdev_lock_ops(dev);
 	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
 	    !ops->get_ethtool_phy_stats) {
 		mcount = phy_ethtool_get_sset_count(dev->phydev);
@@ -253,7 +240,6 @@ static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset,
 			mcount = 0;
 		ops->get_strings(dev, stringset, data);
 	}
-	netdev_unlock_ops(dev);
 
 	list_for_each_entry(dp, &dst->ports, list) {
 		if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2f4fac22d1ab..7eeff658b467 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -64,6 +64,7 @@
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/fcntl.h>
+#include <linux/nospec.h>
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/inet.h>
@@ -371,7 +372,9 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
 				      to, len);
 
 	skb->csum = csum_block_add(skb->csum, csum, odd);
-	if (icmp_pointers[icmp_param->data.icmph.type].error)
+	if (icmp_param->data.icmph.type <= NR_ICMP_TYPES &&
+	    icmp_pointers[array_index_nospec(icmp_param->data.icmph.type,
+					     NR_ICMP_TYPES + 1)].error)
 		nf_ct_attach(skb, icmp_param->skb);
 	return 0;
 }
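The icmp.c change above is the stock Spectre-v1 hardening shape: validate the index, then clamp it with array_index_nospec() so a mispredicted branch cannot speculatively read past the table. A generic, self-contained sketch of the pattern (the handler table here is made up for illustration, not the icmp_pointers definition):

```c
#include <linux/nospec.h>

struct handler {
	bool error;	/* illustrative field, mirroring icmp_pointers[].error */
};

static bool type_is_error(const struct handler *table, size_t nr, u8 type)
{
	if (type >= nr)
		return false;
	/* Clamp the index so it stays in [0, nr) even under speculation. */
	type = array_index_nospec(type, nr);
	return table[type].error;
}
```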
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4ac3ae1bc1af..928654c34156 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1479,16 +1479,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		if (nreq) {
 			refcount_set(&nreq->rsk_refcnt, 1);
+			rcu_read_lock();
 			if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
 				__NET_INC_STATS(sock_net(nsk),
 						LINUX_MIB_TCPMIGRATEREQSUCCESS);
 				reqsk_migrate_reset(req);
+				READ_ONCE(nsk->sk_data_ready)(nsk);
 			} else {
 				__NET_INC_STATS(sock_net(nsk),
 						LINUX_MIB_TCPMIGRATEREQFAILURE);
 				reqsk_migrate_reset(nreq);
 				__reqsk_free(nreq);
 			}
+			rcu_read_unlock();
 
 			/* inet_csk_reqsk_queue_add() has already
 			 * called inet_child_forget() on failure case.
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a5db7c67d61b..625a1ca13b1b 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -79,7 +79,7 @@ static int ipt_nat_register_lookups(struct net *net)
 			while (i)
 				nf_nat_ipv4_unregister_fn(net, &ops[--i]);
 
-			kfree(ops);
+			kfree_rcu(ops, rcu);
 			return ret;
 		}
 	}
@@ -100,7 +100,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
 	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
 		nf_nat_ipv4_unregister_fn(net, &ops[i]);
 
-	kfree(ops);
+	kfree_rcu(ops, rcu);
 }
 
 static int iptable_nat_table_init(struct net *net)
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 904a060a7330..f92fcc39fc4c 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2469,10 +2469,10 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old,
 		goto err_notify;
 	}
 
-	/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+	/* When replacing a nexthop with one of a different family, potentially
 	 * update IPv4 indication in all the groups using the nexthop.
 	 */
-	if (oldi->family == AF_INET && newi->family == AF_INET6) {
+	if (oldi->family != newi->family) {
 		list_for_each_entry(nhge, &old->grp_list, nh_list) {
 			struct nexthop *nhp = nhge->nh_parent;
 			struct nh_group *nhg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2014a6408e93..432fa28e47d4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3424,7 +3424,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
 	WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	WRITE_ONCE(tp->snd_ssthresh, TCP_INFINITE_SSTHRESH);
 	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
 	tp->snd_cwnd_cnt = 0;
 	tp->is_cwnd_limited = 0;
@@ -3622,7 +3622,8 @@ static void tcp_enable_tx_delay(struct sock *sk, int val)
 	if (delta && sk->sk_state == TCP_ESTABLISHED) {
 		s64 srtt = (s64)tp->srtt_us + delta;
 
-		tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+		WRITE_ONCE(tp->srtt_us,
+			   clamp_t(s64, srtt, 1, ~0U));
 
 		/* Note: does not deal with non zero icsk_backoff */
 		tcp_set_rto(sk);
@@ -4190,12 +4191,18 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
 				      struct tcp_info *info)
 {
 	u64 stats[__TCP_CHRONO_MAX], total = 0;
-	enum tcp_chrono i;
+	enum tcp_chrono i, cur;
 
+	/* Following READ_ONCE()s pair with WRITE_ONCE()s in tcp_chrono_set().
+	 * This is because socket lock might not be owned by us at this point.
+	 * This is best effort, tcp_get_timestamping_opt_stats() can
+	 * see wrong values. A real fix would be too costly for TCP fast path.
+	 */
+	cur = READ_ONCE(tp->chrono_type);
 	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
-		stats[i] = tp->chrono_stat[i - 1];
-		if (i == tp->chrono_type)
-			stats[i] += tcp_jiffies32 - tp->chrono_start;
+		stats[i] = READ_ONCE(tp->chrono_stat[i - 1]);
+		if (i == cur)
+			stats[i] += tcp_jiffies32 - READ_ONCE(tp->chrono_start);
 		stats[i] *= USEC_PER_SEC / HZ;
 		total += stats[i];
 	}
@@ -4427,9 +4434,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
 			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
-			  tp->data_segs_out, TCP_NLA_PAD);
+			  READ_ONCE(tp->data_segs_out), TCP_NLA_PAD);
 	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
-			  tp->total_retrans, TCP_NLA_PAD);
+			  READ_ONCE(tp->total_retrans), TCP_NLA_PAD);
 
 	rate = READ_ONCE(sk->sk_pacing_rate);
 	rate64 = (rate != ~0UL) ? rate : ~0ULL;
@@ -4438,37 +4445,42 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
 	rate64 = tcp_compute_delivery_rate(tp);
 	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
 
-	nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
-	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
-	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+	nla_put_u32(stats, TCP_NLA_SND_CWND, READ_ONCE(tp->snd_cwnd));
+	nla_put_u32(stats, TCP_NLA_REORDERING, READ_ONCE(tp->reordering));
+	nla_put_u32(stats, TCP_NLA_MIN_RTT, data_race(tcp_min_rtt(tp)));
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
 		   READ_ONCE(inet_csk(sk)->icsk_retransmits));
-	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
-	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
-	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
-	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
-
-	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
+	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT,
+		   data_race(!!tp->rate_app_limited));
+	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, READ_ONCE(tp->snd_ssthresh));
+	nla_put_u32(stats, TCP_NLA_DELIVERED, READ_ONCE(tp->delivered));
+	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, READ_ONCE(tp->delivered_ce));
+
+	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE,
+		    max_t(int, 0,
+			  READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_una)));
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
-	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
-			  TCP_NLA_PAD);
-	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, READ_ONCE(tp->bytes_sent),
 			  TCP_NLA_PAD);
-	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
-	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
-	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
-	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS,
+			  READ_ONCE(tp->bytes_retrans), TCP_NLA_PAD);
+	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, READ_ONCE(tp->dsack_dups));
+	nla_put_u32(stats, TCP_NLA_REORD_SEEN, READ_ONCE(tp->reord_seen));
+	nla_put_u32(stats, TCP_NLA_SRTT, READ_ONCE(tp->srtt_us) >> 3);
+	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH,
+		    READ_ONCE(tp->timeout_rehash));
 	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
-		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
+		    max_t(int, 0,
+			  READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt)));
 	nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
 			  TCP_NLA_PAD);
 	if (ack_skb)
 		nla_put_u8(stats, TCP_NLA_TTL,
 			   tcp_skb_ttl_or_hop_limit(ack_skb));
 
-	nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
+	nla_put_u32(stats, TCP_NLA_REHASH,
+		    READ_ONCE(tp->plb_rehash) + READ_ONCE(tp->timeout_rehash));
 	return stats;
 }
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 1ddc20a399b0..aec7805b1d37 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -897,8 +897,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
-		tcp_sk(sk)->snd_ssthresh =
-				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+		WRITE_ONCE(tcp_sk(sk)->snd_ssthresh,
+			   bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT));
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
 	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
@@ -1043,7 +1043,7 @@ __bpf_kfunc static void bbr_init(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->prior_cwnd = 0;
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	WRITE_ONCE(tp->snd_ssthresh, TCP_INFINITE_SSTHRESH);
 	bbr->rtt_cnt = 0;
 	bbr->next_rtt_delivered = tp->delivered;
 	bbr->prev_ca_state = TCP_CA_Open;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 58358bf92e1b..65444ff14241 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -74,7 +74,7 @@ static void bictcp_init(struct sock *sk)
 	bictcp_reset(ca);
 
 	if (initial_ssthresh)
-		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+		WRITE_ONCE(tcp_sk(sk)->snd_ssthresh, initial_ssthresh);
 }
 
 /*
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index ceabfd690a29..0812c390aee5 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -162,7 +162,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 				NET_ADD_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINCWND,
 					      tcp_snd_cwnd(tp));
-				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 				return;
 			}
 		}
@@ -181,7 +181,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 				NET_ADD_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTDELAYCWND,
 					      tcp_snd_cwnd(tp));
-				tp->snd_ssthresh = tcp_snd_cwnd(tp);
+				WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 			}
 		}
 	}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ab78b5ae8d0e..119bf8cbb007 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -136,7 +136,7 @@ __bpf_kfunc static void cubictcp_init(struct sock *sk)
 	bictcp_hystart_reset(sk);
 
 	if (!hystart && initial_ssthresh)
-		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+		WRITE_ONCE(tcp_sk(sk)->snd_ssthresh, initial_ssthresh);
 }
 
 __bpf_kfunc static void cubictcp_cwnd_event_tx_start(struct sock *sk)
@@ -420,7 +420,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 			NET_ADD_STATS(sock_net(sk),
 				      LINUX_MIB_TCPHYSTARTTRAINCWND,
 				      tcp_snd_cwnd(tp));
-			tp->snd_ssthresh = tcp_snd_cwnd(tp);
+			WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 		}
 	}
 }
@@ -440,7 +440,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 			NET_ADD_STATS(sock_net(sk),
 				      LINUX_MIB_TCPHYSTARTDELAYCWND,
 				      tcp_snd_cwnd(tp));
-			tp->snd_ssthresh = tcp_snd_cwnd(tp);
+			WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
 		}
 	}
 }
max(tcp_snd_cwnd(tp) >> 1U, 2U)); } __bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 021f745747c5..d5c9e65d9760 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -476,14 +476,14 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { - tp->delivered_ce += ecn_count; + WRITE_ONCE(tp->delivered_ce, tp->delivered_ce + ecn_count); } /* Updates the delivered and delivered_ce counts */ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { - tp->delivered += delivered; + WRITE_ONCE(tp->delivered, tp->delivered + delivered); if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } @@ -1132,7 +1132,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tcp_bpf_rtt(sk, mrtt_us, srtt); } - tp->srtt_us = max(1U, srtt); + WRITE_ONCE(tp->srtt_us, max(1U, srtt)); } void tcp_update_pacing_rate(struct sock *sk) @@ -1246,7 +1246,7 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) state->flag |= FLAG_DSACK_TLP; - tp->dsack_dups += dup_segs; + WRITE_ONCE(tp->dsack_dups, tp->dsack_dups + dup_segs); /* Skip the DSACK if dup segs weren't retransmitted by sender */ if (tp->dsack_dups > tp->total_retrans) return 0; @@ -1293,12 +1293,13 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - tp->reordering = min_t(u32, (metric + mss - 1) / mss, - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); + WRITE_ONCE(tp->reordering, + min_t(u32, (metric + mss - 1) / mss, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering))); } /* This exciting event is worth to be remembered. 8) */ - tp->reord_seen++; + WRITE_ONCE(tp->reord_seen, tp->reord_seen + 1); NET_INC_STATS(sock_net(sk), ts ? 
LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } @@ -2439,9 +2440,10 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) if (!tcp_limit_reno_sacked(tp)) return; - tp->reordering = min_t(u32, tp->packets_out + addend, - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); - tp->reord_seen++; + WRITE_ONCE(tp->reordering, + min_t(u32, tp->packets_out + addend, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering))); + WRITE_ONCE(tp->reord_seen, tp->reord_seen + 1); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); } @@ -2565,7 +2567,7 @@ void tcp_enter_loss(struct sock *sk) (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->prior_cwnd = tcp_snd_cwnd(tp); - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + WRITE_ONCE(tp->snd_ssthresh, icsk->icsk_ca_ops->ssthresh(sk)); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } @@ -2579,8 +2581,8 @@ void tcp_enter_loss(struct sock *sk) reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); if (icsk->icsk_ca_state <= TCP_CA_Disorder && tp->sacked_out >= reordering) - tp->reordering = min_t(unsigned int, tp->reordering, - reordering); + WRITE_ONCE(tp->reordering, + min_t(unsigned int, tp->reordering, reordering)); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; @@ -2858,7 +2860,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk)); if (tp->prior_ssthresh > tp->snd_ssthresh) { - tp->snd_ssthresh = tp->prior_ssthresh; + WRITE_ONCE(tp->snd_ssthresh, tp->prior_ssthresh); tcp_ecn_withdraw_cwr(tp); } } @@ -2976,7 +2978,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tp->prior_cwnd = tcp_snd_cwnd(tp); tp->prr_delivered = 0; tp->prr_out = 0; - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + WRITE_ONCE(tp->snd_ssthresh, inet_csk(sk)->icsk_ca_ops->ssthresh(sk)); tcp_ecn_queue_cwr(tp); } @@ -3118,7 +3120,7 @@ static void tcp_non_congestion_loss_retransmit(struct sock *sk) if (icsk->icsk_ca_state != TCP_CA_Loss) { tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(sk); + WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk)); tp->prior_ssthresh = 0; tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Loss); @@ -3910,7 +3912,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) sock_owned_by_me((struct sock *)tp); tp->bytes_acked += delta; tcp_snd_sne_update(tp, ack); - tp->snd_una = ack; + WRITE_ONCE(tp->snd_una, ack); } static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) @@ -4284,11 +4286,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto old_ack; } - /* If the ack includes data we haven't sent yet, discard - * this segment (RFC793 Section 3.9). + /* If the ack includes data we haven't sent yet, drop the + * segment. RFC 793 Section 3.9 and RFC 5961 Section 5.2 + * require us to send an ACK back in that case. 
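That comment carries the substance of the fix: previously an ACK for data that was never sent was discarded silently, so a peer whose ACK raced ahead of snd_nxt (or an RFC 5961 blind-window probe) received no reply at all. A minimal sketch of the behaviour the hunk restores, using the identifiers from the patch itself; the FLAG_NO_CHALLENGE_ACK guard lets call sites that must not emit a challenge opt out:

	if (after(ack, tp->snd_nxt)) {
		/* The ACK acknowledges data we never sent: refuse to
		 * process it, but answer with a rate-limited challenge
		 * ACK so a legitimate peer can resynchronise instead
		 * of retransmitting indefinitely.
		 */
		if (!(flag & FLAG_NO_CHALLENGE_ACK))
			tcp_send_challenge_ack(sk, false);
		return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
	}
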
*/ - if (after(ack, tp->snd_nxt)) + if (after(ack, tp->snd_nxt)) { + if (!(flag & FLAG_NO_CHALLENGE_ACK)) + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA; + } if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; @@ -6777,7 +6783,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); /* SYN-data is counted as two separate packets in tcp_ack() */ if (tp->delivered > 1) - --tp->delivered; + WRITE_ONCE(tp->delivered, tp->delivered - 1); } tcp_fastopen_add_skb(sk, synack); @@ -7210,7 +7216,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) SKB_DR_SET(reason, NOT_SPECIFIED); switch (sk->sk_state) { case TCP_SYN_RECV: - tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ + WRITE_ONCE(tp->delivered, tp->delivered + 1); /* SYN-ACK delivery isn't tracked in tcp_ack */ if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); @@ -7238,7 +7244,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (sk->sk_socket) sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + WRITE_ONCE(tp->snd_una, TCP_SKB_CB(skb)->ack_seq); tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 06b1d5d3b6df..dc0c081fc1f3 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -490,13 +490,13 @@ void tcp_init_metrics(struct sock *sk) val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ? 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { - tp->snd_ssthresh = val; + WRITE_ONCE(tp->snd_ssthresh, val); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; + WRITE_ONCE(tp->snd_ssthresh, tp->snd_cwnd_clamp); } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) - tp->reordering = val; + WRITE_ONCE(tp->reordering, val); crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index a60662f4bdf9..f345897a68df 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -396,8 +396,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) /* We have enough data to determine we are congested */ ca->nv_allow_cwnd_growth = 0; - tp->snd_ssthresh = - (nv_ssthresh_factor * max_win) >> 3; + WRITE_ONCE(tp->snd_ssthresh, + (nv_ssthresh_factor * max_win) >> 3); if (tcp_snd_cwnd(tp) - max_win > 2) { /* gap > 2, we do exponential cwnd decrease */ int dec; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e99687526a6..f9d8755705f7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -171,7 +171,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) tcp_ca_event(sk, CA_EVENT_CWND_RESTART); - tp->snd_ssthresh = tcp_current_ssthresh(sk); + WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk)); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) @@ -1688,8 +1688,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); - tp->data_segs_out += tcp_skb_pcount(skb); - tp->bytes_sent += skb->len - tcp_header_size; + WRITE_ONCE(tp->data_segs_out, + tp->data_segs_out + tcp_skb_pcount(skb)); + WRITE_ONCE(tp->bytes_sent, + tp->bytes_sent + skb->len - tcp_header_size); } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) @@ 
-2142,7 +2144,7 @@ static void tcp_cwnd_application_limited(struct sock *sk) u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tcp_snd_cwnd(tp)) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); + WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk)); tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; @@ -3642,8 +3644,8 @@ start: TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - tp->total_retrans += segs; - tp->bytes_retrans += skb->len; + WRITE_ONCE(tp->total_retrans, tp->total_retrans + segs); + WRITE_ONCE(tp->bytes_retrans, tp->bytes_retrans + skb->len); /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom @@ -4152,7 +4154,7 @@ static void tcp_connect_init(struct sock *sk) tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); - tp->snd_una = tp->write_seq; + WRITE_ONCE(tp->snd_una, tp->write_seq); tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); @@ -4646,7 +4648,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ - tcp_sk_rw(sk)->total_retrans++; + WRITE_ONCE(tcp_sk_rw(sk)->total_retrans, + tcp_sk_rw(sk)->total_retrans + 1); } trace_tcp_retransmit_synack(sk, req); WRITE_ONCE(req->num_retrans, req->num_retrans + 1); diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index 68ccdb9a5412..c11a0cd3f8fe 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -80,7 +80,7 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; - tcp_sk(sk)->plb_rehash++; + WRITE_ONCE(tcp_sk(sk)->plb_rehash, tcp_sk(sk)->plb_rehash + 1); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ea99988795e7..8d791a954cd6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -297,7 +297,7 @@ static int tcp_write_timeout(struct sock *sk) } if (sk_rethink_txhash(sk)) { - tp->timeout_rehash++; + WRITE_ONCE(tp->timeout_rehash, tp->timeout_rehash + 1); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 950a66966059..574453af6bc0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -245,7 +245,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), (u32)target_cwnd + 1)); - tp->snd_ssthresh = tcp_vegas_ssthresh(tp); + WRITE_ONCE(tp->snd_ssthresh, + tcp_vegas_ssthresh(tp)); } else if (tcp_in_slow_start(tp)) { /* Slow start. */ @@ -261,8 +262,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) * we slow down. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); - tp->snd_ssthresh - = tcp_vegas_ssthresh(tp); + WRITE_ONCE(tp->snd_ssthresh, + tcp_vegas_ssthresh(tp)); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. 
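The run of tcp_* hunks above, and the bbr/bic/cdg/cubic/dctcp/nv/plb/timer ones around them, all apply one idiom: a field written under the socket lock but read locklessly (by tcp_get_info(), the timestamping opt-stats path, or diag) gets WRITE_ONCE() at every writer and READ_ONCE() at every reader. A kernel-style sketch of the pairing with hypothetical names follows; note that for a u64 on a 32-bit host this does not make the access atomic. It documents the intentional race for KCSAN and prevents compiler tearing or double loads, and an occasionally stale stat is accepted, as the "too costly for TCP fast path" comment further up says:

	struct conn_stats {
		u64 bytes_sent;		/* written only under the owner's lock */
	};

	static void stats_add(struct conn_stats *s, u64 delta)
	{
		/* A plain "s->bytes_sent += delta" may be emitted as
		 * several stores that a lockless reader can observe
		 * half-done; WRITE_ONCE() forbids such tearing on
		 * naturally aligned, word-sized pieces.
		 */
		WRITE_ONCE(s->bytes_sent, s->bytes_sent + delta);
	}

	static u64 stats_read_lockless(const struct conn_stats *s)
	{
		return READ_ONCE(s->bytes_sent); /* pair every annotated store */
	}
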
@@ -280,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp) tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp); - tp->snd_ssthresh = tcp_current_ssthresh(sk); + WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk)); } /* Wipe the slate clean for the next RTT. */ diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index c6e97141eef2..b5a42adfd6ca 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -244,11 +244,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) switch (event) { case CA_EVENT_COMPLETE_CWR: - tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + WRITE_ONCE(tp->snd_ssthresh, tcp_westwood_bw_rttmin(sk)); tcp_snd_cwnd_set(tp, tp->snd_ssthresh); break; case CA_EVENT_LOSS: - tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + WRITE_ONCE(tp->snd_ssthresh, tcp_westwood_bw_rttmin(sk)); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index b22b3dccd05e..9e581154f18f 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -147,7 +147,8 @@ do_vegas: tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), yeah->reno_count)); - tp->snd_ssthresh = tcp_snd_cwnd(tp); + WRITE_ONCE(tp->snd_ssthresh, + tcp_snd_cwnd(tp)); } if (yeah->reno_count <= 2) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 799d9e9ac45d..efb23807a026 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1104,7 +1104,6 @@ static int icmpv6_rcv(struct sk_buff *skb) struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); - const struct in6_addr *saddr, *daddr; struct icmp6hdr *hdr; u8 type; @@ -1135,12 +1134,10 @@ static int icmpv6_rcv(struct sk_buff *skb) __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); - saddr = &ipv6_hdr(skb)->saddr; - daddr = &ipv6_hdr(skb)->daddr; - if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) { net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n", - saddr, daddr); + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr); goto csum_error; } @@ -1220,7 +1217,8 @@ static int icmpv6_rcv(struct sk_buff *skb) break; net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n", - saddr, daddr); + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr); /* * error of unknown type. 
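The icmp.c hunk just above is the "possible UAF in icmpv6_rcv()" fix from the merge summary: saddr/daddr were cached before skb_checksum_validate(), which can reallocate the skb head in some paths, leaving the cached pointers into the old buffer dangling by the time they were printed. A sketch of the bug class, assuming only that the validator may move skb->head:

	/* Buggy shape: a pointer cached across a call that may
	 * reallocate the data it points into.
	 */
	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; /* into skb->head */

	if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
		/* skb->head may have moved, so "saddr" can dangle here.
		 * The fix re-derives the header at each point of use:
		 */
		net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
				    &ipv6_hdr(skb)->saddr,
				    &ipv6_hdr(skb)->daddr);
		goto csum_error;
	}
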
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 46bc06506470..c468c83af0f2 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -62,6 +62,8 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); +#define IP6_TUNNEL_MAX_DEST_TLVS 8 + #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) @@ -425,11 +427,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) break; } if (nexthdr == NEXTHDR_DEST) { + int tlv_cnt = 0; u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; + if (unlikely(tlv_cnt++ >= IP6_TUNNEL_MAX_DEST_TLVS)) + break; + /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index e119d4f090cc..5be723232df8 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -81,7 +81,7 @@ static int ip6t_nat_register_lookups(struct net *net) while (i) nf_nat_ipv6_unregister_fn(net, &ops[--i]); - kfree(ops); + kfree_rcu(ops, rcu); return ret; } } @@ -102,7 +102,7 @@ static void ip6t_nat_unregister_lookups(struct net *net) for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) nf_nat_ipv6_unregister_fn(net, &ops[i]); - kfree(ops); + kfree_rcu(ops, rcu); } static int ip6table_nat_table_init(struct net *net) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 97b50d9b1365..9b64343ebad6 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -746,7 +746,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, newts->type = LWTUNNEL_ENCAP_SEG6; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; - if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) + if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP && + tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP_RED) newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; newts->headroom = seg6_lwt_headroom(tuninfo); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 59d593bb5d18..1b210db3119e 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -520,8 +520,10 @@ static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr, if (sk->sk_state == TCP_SYN_SENT) { const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) + if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) { + rc = -EINPROGRESS; goto out; + } rc = sock_intr_errno(timeo); if (signal_pending(current)) diff --git a/net/mctp/route.c b/net/mctp/route.c index 26fb8c6bbad2..1f3dccbb7aed 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -441,6 +441,7 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) unsigned long f; u8 tag, flags; int rc; + u8 ver; msk = NULL; rc = -EINVAL; @@ -467,7 +468,8 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) netid = mctp_cb(skb)->net; skb_pull(skb, sizeof(struct mctp_hdr)); - if (mh->ver != 1) + ver = mh->ver & MCTP_HDR_VER_MASK; + if (ver < MCTP_VER_MIN || ver > MCTP_VER_MAX) goto out; flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM); @@ -1317,6 +1319,7 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, struct mctp_dst dst; struct mctp_hdr *mh; int rc; + u8 ver; rcu_read_lock(); mdev = __mctp_dev_get(dev); @@ -1334,7 +1337,8 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, /* We have enough for a header; decode and route */ mh = mctp_hdr(skb); - if (mh->ver < MCTP_VER_MIN || mh->ver > 
MCTP_VER_MAX) + ver = mh->ver & MCTP_HDR_VER_MASK; + if (ver < MCTP_VER_MIN || ver > MCTP_VER_MAX) goto err_drop; /* source must be valid unicast or null; drop reserved ranges and diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index fbffd3a43fe8..718e910ff23f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3594,7 +3594,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); - __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); msk->rcvq_space.time = mptcp_stamp(); @@ -4252,6 +4251,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, mptcp_graft_subflows(newsk); mptcp_rps_record_subflows(msk); + __mptcp_propagate_sndbuf(newsk, mptcp_subflow_tcp_sock(subflow)); /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 0fb5162992e5..ce542ed4b013 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -102,6 +102,18 @@ __ip_vs_dst_check(struct ip_vs_dest *dest) return dest_dst; } +/* Based on ip_exceeds_mtu(). */ +static bool ip_vs_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) + return false; + + return true; +} + static inline bool __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) { @@ -111,10 +123,9 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) */ if (IP6CB(skb)->frag_max_size > mtu) return true; /* largest fragment violate MTU */ - } - else if (skb->len > mtu && !skb_is_gso(skb)) { + } else if (ip_vs_exceeds_mtu(skb, mtu)) return true; /* Packet size violate MTU size */ - } + return false; } @@ -232,7 +243,7 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, return true; if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && - skb->len > mtu && !skb_is_gso(skb) && + ip_vs_exceeds_mtu(skb, mtu) && !ip_vs_iph_icmp(ipvsh))) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index 98deef6cde69..8f1054920a85 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -50,7 +50,7 @@ static unsigned int help(struct sk_buff *skb, return NF_DROP; } - sprintf(buffer, "%u", port); + snprintf(buffer, sizeof(buffer), "%u", port); if (!nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, protoff, matchoff, matchlen, buffer, strlen(buffer))) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 83b2b5e9759a..74ec224ce0d6 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1222,9 +1222,11 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); - for (i = 0; i < ops_count; i++) - kfree(nat_ops[i].priv); - kfree(nat_ops); + for (i = 0; i < ops_count; i++) { + priv = nat_ops[i].priv; + kfree_rcu(priv, rcu_head); + } + kfree_rcu(nat_ops, rcu); return ret; } @@ -1288,7 +1290,7 @@ void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, } nat_proto_net->nat_hook_ops = NULL; - kfree(nat_ops); + kfree_rcu(nat_ops, rcu); } unlock: mutex_unlock(&nf_nat_proto_mutex); diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 
cf4aeb299bde..c845b6d1a2bd 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -68,25 +68,27 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff, } static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer, + size_t size, const union nf_inet_addr *addr, bool delim) { if (nf_ct_l3num(ct) == NFPROTO_IPV4) - return sprintf(buffer, "%pI4", &addr->ip); + return scnprintf(buffer, size, "%pI4", &addr->ip); else { if (delim) - return sprintf(buffer, "[%pI6c]", &addr->ip6); + return scnprintf(buffer, size, "[%pI6c]", &addr->ip6); else - return sprintf(buffer, "%pI6c", &addr->ip6); + return scnprintf(buffer, size, "%pI6c", &addr->ip6); } } static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer, + size_t size, const union nf_inet_addr *addr, u16 port) { if (nf_ct_l3num(ct) == NFPROTO_IPV4) - return sprintf(buffer, "%pI4:%u", &addr->ip, port); + return scnprintf(buffer, size, "%pI4:%u", &addr->ip, port); else - return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port); + return scnprintf(buffer, size, "[%pI6c]:%u", &addr->ip6, port); } static int map_addr(struct sk_buff *skb, unsigned int protoff, @@ -119,7 +121,7 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff, if (nf_inet_addr_cmp(&newaddr, addr) && newport == port) return 1; - buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport)); + buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer), &newaddr, ntohs(newport)); return mangle_packet(skb, protoff, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen); } @@ -212,7 +214,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, &addr, true) > 0 && nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) && !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) { - buflen = sip_sprintf_addr(ct, buffer, + buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), &ct->tuplehash[!dir].tuple.dst.u3, true); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, @@ -229,7 +231,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, &addr, false) > 0 && nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) && !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) { - buflen = sip_sprintf_addr(ct, buffer, + buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), &ct->tuplehash[!dir].tuple.src.u3, false); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, @@ -247,7 +249,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; - buflen = sprintf(buffer, "%u", ntohs(p)); + buflen = scnprintf(buffer, sizeof(buffer), "%u", ntohs(p)); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, poff, plen, buffer, buflen)) { nf_ct_helper_log(skb, ct, "cannot mangle rport"); @@ -418,7 +420,8 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) || exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { - buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port); + buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer), + &newaddr, port); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); @@ -438,8 +441,8 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int 
protoff, { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + char buffer[sizeof("4294967295")]; unsigned int matchoff, matchlen; - char buffer[sizeof("65536")]; int buflen, c_len; /* Get actual SDP length */ @@ -454,7 +457,7 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff, &matchoff, &matchlen) <= 0) return 0; - buflen = sprintf(buffer, "%u", c_len); + buflen = scnprintf(buffer, sizeof(buffer), "%u", c_len); return mangle_packet(skb, protoff, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen); } @@ -491,7 +494,7 @@ static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff, char buffer[INET6_ADDRSTRLEN]; unsigned int buflen; - buflen = sip_sprintf_addr(ct, buffer, addr, false); + buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false); if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, type, term, buffer, buflen)) return 0; @@ -509,7 +512,7 @@ static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff, char buffer[sizeof("nnnnn")]; unsigned int buflen; - buflen = sprintf(buffer, "%u", port); + buflen = scnprintf(buffer, sizeof(buffer), "%u", port); if (!mangle_packet(skb, protoff, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen)) return 0; @@ -529,7 +532,7 @@ static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff unsigned int buflen; /* Mangle session description owner and contact addresses */ - buflen = sip_sprintf_addr(ct, buffer, addr, false); + buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false); if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff, SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen)) return 0; diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index d64ce21c7b55..acb753ec5697 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -31,26 +31,18 @@ EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); - const struct in_ifaddr *ifa; - int ret = 0; - if (ttl_check == NF_OSF_TTL_TRUE) + switch (ttl_check) { + case NF_OSF_TTL_TRUE: return ip->ttl == f_ttl; - if (ttl_check == NF_OSF_TTL_NOCHECK) - return 1; - else if (ip->ttl <= f_ttl) + break; + case NF_OSF_TTL_NOCHECK: return 1; - - in_dev_for_each_ifa_rcu(ifa, in_dev) { - if (inet_ifa_match(ip->saddr, ifa)) { - ret = (ip->ttl == f_ttl); - break; - } + case NF_OSF_TTL_LESS: + default: + return ip->ttl <= f_ttl; } - - return ret; } struct nf_osf_hdr_ctx { @@ -64,9 +56,9 @@ struct nf_osf_hdr_ctx { static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, - struct nf_osf_hdr_ctx *ctx) + const struct nf_osf_hdr_ctx *ctx) { - const __u8 *optpinit = ctx->optp; + const __u8 *optp = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; @@ -95,17 +87,17 @@ static bool nf_osf_match_one(const struct sk_buff *skb, check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == *ctx->optp) { + if (f->opt[optnum].kind == *optp) { __u32 len = f->opt[optnum].length; - const __u8 *optend = ctx->optp + len; + const __u8 *optend = optp + len; fmatch = FMATCH_OK; - switch (*ctx->optp) { + switch (*optp) { case OSFOPT_MSS: - mss = ctx->optp[3]; + mss = optp[3]; mss <<= 8; - mss |= ctx->optp[2]; + mss |= optp[2]; mss = 
ntohs((__force __be16)mss); break; @@ -113,7 +105,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb, break; } - ctx->optp = optend; + optp = optend; } else fmatch = FMATCH_OPT_WRONG; @@ -156,9 +148,6 @@ static bool nf_osf_match_one(const struct sk_buff *skb, } } - if (fmatch != FMATCH_OK) - ctx->optp = optpinit; - return fmatch == FMATCH_OK; } @@ -320,6 +309,10 @@ static int nfnl_osf_add_callback(struct sk_buff *skb, if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; + if (f->wss.wc >= OSF_WSS_MAX || + (f->wss.wc == OSF_WSS_MODULO && f->wss.val == 0)) + return -EINVAL; + for (i = 0; i < f->opt_num; i++) { if (!f->opt[i].length || f->opt[i].length > MAX_IPOPTLEN) return -EINVAL; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 18003433476c..c02d5cb52143 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nf_osf_data data; struct tcphdr _tcph; + if (nft_pf(pkt) != NFPROTO_IPV4) { + regs->verdict.code = NFT_BREAK; + return; + } + if (pkt->tprot != IPPROTO_TCP) { regs->verdict.code = NFT_BREAK; return; @@ -114,7 +119,6 @@ static int nft_osf_validate(const struct nft_ctx *ctx, switch (ctx->family) { case NFPROTO_IPV4: - case NFPROTO_IPV6: case NFPROTO_INET: hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING) | diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 4798cd2ca26e..7fc5156825e4 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -36,25 +36,37 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static struct xt_match mac_mt_reg __read_mostly = { - .name = "mac", - .revision = 0, - .family = NFPROTO_UNSPEC, - .match = mac_mt, - .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD), - .me = THIS_MODULE, +static struct xt_match mac_mt_reg[] __read_mostly = { + { + .name = "mac", + .family = NFPROTO_IPV4, + .match = mac_mt, + .matchsize = sizeof(struct xt_mac_info), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, + }, + { + .name = "mac", + .family = NFPROTO_IPV6, + .match = mac_mt, + .matchsize = sizeof(struct xt_mac_info), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, + }, }; static int __init mac_mt_init(void) { - return xt_register_match(&mac_mt_reg); + return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg)); } static void __exit mac_mt_exit(void) { - xt_unregister_match(&mac_mt_reg); + xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg)); } module_init(mac_mt_init); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 5bfb4843df66..8f2e57b2a586 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -127,26 +127,39 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } -static struct xt_match owner_mt_reg __read_mostly = { - .name = "owner", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = owner_check, - .match = owner_mt, - .matchsize = sizeof(struct xt_owner_match_info), - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING), - .me = THIS_MODULE, +static struct xt_match owner_mt_reg[] __read_mostly = { + { + .name = "owner", + .revision = 1, + .family = NFPROTO_IPV4, + .checkentry = owner_check, + .match = owner_mt, + .matchsize = sizeof(struct 
xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + }, + { + .name = "owner", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = owner_check, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, + } }; static int __init owner_mt_init(void) { - return xt_register_match(&owner_mt_reg); + return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg)); } static void __exit owner_mt_exit(void) { - xt_unregister_match(&owner_mt_reg); + xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg)); } module_init(owner_mt_init); diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 53997771013f..d2b0b52434fa 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -137,24 +137,33 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) return 0; } -static struct xt_match physdev_mt_reg __read_mostly = { - .name = "physdev", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = physdev_mt_check, - .match = physdev_mt, - .matchsize = sizeof(struct xt_physdev_info), - .me = THIS_MODULE, +static struct xt_match physdev_mt_reg[] __read_mostly = { + { + .name = "physdev", + .family = NFPROTO_IPV4, + .checkentry = physdev_mt_check, + .match = physdev_mt, + .matchsize = sizeof(struct xt_physdev_info), + .me = THIS_MODULE, + }, + { + .name = "physdev", + .family = NFPROTO_IPV6, + .checkentry = physdev_mt_check, + .match = physdev_mt, + .matchsize = sizeof(struct xt_physdev_info), + .me = THIS_MODULE, + }, }; static int __init physdev_mt_init(void) { - return xt_register_match(&physdev_mt_reg); + return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg)); } static void __exit physdev_mt_exit(void) { - xt_unregister_match(&physdev_mt_reg); + xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg)); } module_init(physdev_mt_init); diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index 6df485f4403d..61b2f1e58d15 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -33,7 +33,7 @@ static struct xt_match realm_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_realm_info), .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .me = THIS_MODULE }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e209099218b4..bbbde50fc649 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2184,9 +2184,40 @@ error: return err; } +static size_t ovs_vport_cmd_msg_size(void) +{ + size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); + + msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_PORT_NO */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_TYPE */ + msgsize += nla_total_size(IFNAMSIZ); /* OVS_VPORT_ATTR_NAME */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_IFINDEX */ + msgsize += nla_total_size(sizeof(s32)); /* OVS_VPORT_ATTR_NETNSID */ + + /* OVS_VPORT_ATTR_STATS */ + msgsize += nla_total_size_64bit(sizeof(struct ovs_vport_stats)); + + /* OVS_VPORT_ATTR_UPCALL_STATS(OVS_VPORT_UPCALL_ATTR_SUCCESS + + * OVS_VPORT_UPCALL_ATTR_FAIL) + */ + msgsize += nla_total_size(nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64))); + + /* OVS_VPORT_ATTR_UPCALL_PID */ + msgsize += nla_total_size(nr_cpu_ids * sizeof(u32)); + + /* 
OVS_VPORT_ATTR_OPTIONS(OVS_TUNNEL_ATTR_DST_PORT + + * OVS_TUNNEL_ATTR_EXTENSION(OVS_VXLAN_EXT_GBP)) + */ + msgsize += nla_total_size(nla_total_size(sizeof(u16)) + + nla_total_size(nla_total_size(0))); + + return msgsize; +} + static struct sk_buff *ovs_vport_cmd_alloc_info(void) { - return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + return genlmsg_new(ovs_vport_cmd_msg_size(), GFP_KERNEL); } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ @@ -2196,7 +2227,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, struct sk_buff *skb; int retval; - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + skb = ovs_vport_cmd_alloc_info(); if (!skb) return ERR_PTR(-ENOMEM); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 23f629e94a36..56b2e2d1a749 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -406,6 +406,9 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; + if (nla_len(ids) / sizeof(u32) > nr_cpu_ids) + return -EINVAL; + old = ovsl_dereference(vport->upcall_portids); vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 4b043241fd56..8e6f3a734ba0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2718,7 +2718,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; - struct virtio_net_hdr *vnet_hdr = NULL; + struct virtio_net_hdr vnet_hdr; + bool has_vnet_hdr = false; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; @@ -2819,16 +2820,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { - vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; - if (tp_len < 0 || - __packet_snd_vnet_parse(vnet_hdr, tp_len)) { + if (tp_len < 0) { + tp_len = -EINVAL; + goto tpacket_error; + } + memcpy(&vnet_hdr, data - vnet_hdr_sz, sizeof(vnet_hdr)); + if (__packet_snd_vnet_parse(&vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), - vnet_hdr->hdr_len); + vnet_hdr.hdr_len); + has_vnet_hdr = true; } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, @@ -2865,12 +2870,12 @@ tpacket_error: } } - if (vnet_hdr_sz) { - if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { + if (has_vnet_hdr) { + if (virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } - virtio_net_hdr_set_proto(skb, vnet_hdr); + virtio_net_hdr_set_proto(skb, &vnet_hdr); } skb->destructor = tpacket_destruct_skb; diff --git a/net/rds/connection.c b/net/rds/connection.c index 412441aaa298..c10b7ed06c49 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -701,6 +701,13 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { + /* Zero the per-item buffer before handing it to the + * visitor so any field the visitor does not write - + * including implicit alignment padding - cannot leak + * stack contents to user space via rds_info_copy(). + */ + memset(buffer, 0, item_len); + /* XXX no c_lock usage.. 
*/ if (!visitor(conn, buffer)) continue; @@ -750,6 +757,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, */ cp = conn->c_path; + /* Zero the per-item buffer for the same reason as + * rds_for_each_conn_info(): any byte the visitor + * does not write (including alignment padding) must + * not leak stack contents via rds_info_copy(). + */ + memset(buffer, 0, item_len); + /* XXX no cp_lock usage.. */ if (!visitor(cp, buffer)) continue; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index aa6465dc742c..61fb6e45281b 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -326,10 +326,6 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { - if (!need_odp) { - unpin_user_pages(pages, nr_pages); - kfree(sg); - } ret = -EFAULT; goto out; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 96ecb83c9071..27c2aa2dd023 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1486,7 +1486,6 @@ int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); -void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fec59d9338b9..fdd683261226 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -332,7 +332,25 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; - rxrpc_input_call_packet(call, skb); + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.securityIndex != 0 && + skb_cloned(skb)) { + /* Unshare the packet so that it can be + * modified by in-place decryption. + */ + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + + if (nskb) { + rxrpc_new_skb(nskb, rxrpc_skb_new_unshared); + rxrpc_input_call_packet(call, nskb); + rxrpc_free_skb(nskb, rxrpc_skb_put_call_rx); + } else { + /* OOM - Drop the packet. */ + rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem); + } + } else { + rxrpc_input_call_packet(call, skb); + } rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); did_receive = true; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 9a41ec708aeb..a2130d25aaa9 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -240,6 +240,33 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) rxrpc_notify_socket(call); } +static int rxrpc_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + int ret; + + if (skb_cloned(skb)) { + /* Copy the packet if shared so that we can do in-place + * decryption. + */ + struct sk_buff *nskb = skb_copy(skb, GFP_NOFS); + + if (nskb) { + rxrpc_new_skb(nskb, rxrpc_skb_new_unshared); + ret = conn->security->verify_response(conn, nskb); + rxrpc_free_skb(nskb, rxrpc_skb_put_response_copy); + } else { + /* OOM - Drop the packet. 
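This OOM branch and the matching one added to rxrpc_input_call_event() above enforce a single invariant: a cloned skb shares its data area with its clones, so decrypting it in place would corrupt every other holder's view. Take a private skb_copy() first, and if that allocation fails, drop the packet rather than scribble on shared bytes. Condensed into one hedged helper (simplified; the real code also emits skb tracepoints, and the conn path allocates with GFP_NOFS rather than GFP_ATOMIC):

	static int crypt_in_place_safely(struct sk_buff *skb,
					 int (*decrypt)(struct sk_buff *))
	{
		struct sk_buff *nskb;
		int ret;

		if (!skb_cloned(skb))
			return decrypt(skb);	/* sole owner of the data area */

		nskb = skb_copy(skb, GFP_ATOMIC); /* private head and data */
		if (!nskb)
			return -ENOMEM;	/* drop, never modify a shared buffer */

		ret = decrypt(nskb);
		kfree_skb(nskb);	/* verification only needed the result */
		return ret;
	}
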
*/ + rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem); + ret = -ENOMEM; + } + } else { + ret = conn->security->verify_response(conn, skb); + } + + return ret; +} + /* * connection-level Rx packet processor */ @@ -270,7 +297,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, } spin_unlock_irq(&conn->state_lock); - ret = conn->security->verify_response(conn, skb); + ret = rxrpc_verify_response(conn, skb); if (ret < 0) return ret; @@ -362,7 +389,6 @@ again: static void rxrpc_do_process_connection(struct rxrpc_connection *conn) { struct sk_buff *skb; - int ret; if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); @@ -371,17 +397,8 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn) * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { rxrpc_see_skb(skb, rxrpc_skb_see_conn_work); - ret = rxrpc_process_event(conn, skb); - switch (ret) { - case -ENOMEM: - case -EAGAIN: - skb_queue_head(&conn->rx_queue, skb); - rxrpc_queue_conn(conn, rxrpc_conn_queue_retry_work); - break; - default: - rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); - break; - } + rxrpc_process_event(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); } } diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 697956931925..dc5184a2fa9d 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -192,13 +192,12 @@ static bool rxrpc_extract_abort(struct sk_buff *skb) /* * Process packets received on the local endpoint */ -static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) +static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_connection *conn; struct sockaddr_rxrpc peer_srx; struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; - struct sk_buff *skb = *_skb; bool ret = false; skb_pull(skb, sizeof(struct udphdr)); @@ -244,25 +243,6 @@ static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); if (sp->hdr.seq == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_seq); - - /* Unshare the packet so that it can be modified for in-place - * decryption. 
- */ - if (sp->hdr.securityIndex != 0) { - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem); - *_skb = NULL; - return just_discard; - } - - if (skb != *_skb) { - rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare); - *_skb = skb; - rxrpc_new_skb(skb, rxrpc_skb_new_unshared); - sp = rxrpc_skb(skb); - } - } break; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -494,7 +474,7 @@ int rxrpc_io_thread(void *data) switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; - if (!rxrpc_input_packet(local, &skb)) + if (!rxrpc_input_packet(local, skb)) rxrpc_reject_packet(local, skb); trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_free_skb(skb, rxrpc_skb_put_input); diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 6301d79ee35a..3ec3d89fdf14 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -502,6 +502,10 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) if (v1->security_index != RXRPC_SECURITY_RXKAD) goto error; + ret = -EKEYREJECTED; + if (v1->ticket_length > AFSTOKEN_RK_TIX_MAX) + goto error; + plen = sizeof(*token->kad) + v1->ticket_length; prep->quotalen += plen + sizeof(*token); diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index 30275cb5ba3e..0ef2a29eb695 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -214,7 +214,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + if (ticket_len > xdr_round_down(token_len - sizeof(container))) goto short_packet; _debug("KVNO %u", kvno); @@ -245,6 +245,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, if (ret != -ENOMEM) return rxrpc_abort_conn(conn, skb, ec, ret, rxgk_abort_resp_tok_dec); + return ret; } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 80164d89e19c..1e257d7ab8ec 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -34,6 +34,7 @@ struct rxgk_context { }; #define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_round_down(x) (round_down((x), sizeof(__be32))) #define xdr_object_len(x) (4 + xdr_round_up(x)) /* diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index eb7f2769d2b1..cba7935977f0 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -510,6 +510,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, rxkad_abort_2_short_header); + /* Don't let the crypto algo see a misaligned length. */ + sp->len = round_down(sp->len, 8); + /* Decrypt the skbuff in-place. TODO: We really want to decrypt * directly into the target buffer. 
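The round_down() above pairs with the abort-instead-of-WARN change in the next hunk: rxkad's cipher operates on 8-byte blocks, so a trailing partial block can never be valid ciphertext, yet the old code passed the raw length through and then WARNed when the crypto layer rejected it, letting a remote peer trigger kernel warnings with a malformed packet. Clamping first keeps the crypto call well-formed, and any remaining failure other than -ENOMEM now aborts the call as a protocol error. The guard in isolation, assuming the 8-byte block size:

	#define RXKAD_BLOCK_SIZE 8	/* assumed cipher block size */

	static unsigned int rxkad_crypt_len(unsigned int len)
	{
		/* A partial trailing block cannot decrypt to anything
		 * meaningful; truncate rather than let the crypto API
		 * fail (or warn) on a misaligned request.
		 */
		return round_down(len, RXKAD_BLOCK_SIZE);
	}
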
*/ @@ -543,8 +546,10 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, if (sg != _sg) kfree(sg); if (ret < 0) { - WARN_ON_ONCE(ret != -ENOMEM); - return ret; + if (ret == -ENOMEM) + return ret; + return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON, + rxkad_abort_2_crypto_unaligned); } /* Extract the decrypted packet length */ @@ -1136,7 +1141,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct rxrpc_crypt session_key; struct key *server_key; time64_t expiry; - void *ticket; + void *ticket = NULL; u32 version, kvno, ticket_len, level; __be32 csum; int ret, i; @@ -1162,13 +1167,13 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, ret = -ENOMEM; response = kzalloc_obj(struct rxkad_response, GFP_NOFS); if (!response) - goto temporary_error; + goto error; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), response, sizeof(*response)) < 0) { - rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_resp_short); - goto protocol_error; + ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_resp_short); + goto error; } version = ntohl(response->version); @@ -1178,62 +1183,62 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len); if (version != RXKAD_VERSION) { - rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_resp_version); - goto protocol_error; + ret = rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_resp_version); + goto error; } if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) { - rxrpc_abort_conn(conn, skb, RXKADTICKETLEN, -EPROTO, - rxkad_abort_resp_tkt_len); - goto protocol_error; + ret = rxrpc_abort_conn(conn, skb, RXKADTICKETLEN, -EPROTO, + rxkad_abort_resp_tkt_len); + goto error; } if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) { - rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, -EPROTO, - rxkad_abort_resp_unknown_tkt); - goto protocol_error; + ret = rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, -EPROTO, + rxkad_abort_resp_unknown_tkt); + goto error; } /* extract the kerberos ticket and decrypt and decode it */ ret = -ENOMEM; ticket = kmalloc(ticket_len, GFP_NOFS); if (!ticket) - goto temporary_error_free_resp; + goto error; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), ticket, ticket_len) < 0) { - rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_resp_short_tkt); - goto protocol_error; + ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_resp_short_tkt); + goto error; } ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, &session_key, &expiry); if (ret < 0) - goto temporary_error_free_ticket; + goto error; /* use the session key from inside the ticket to decrypt the * response */ ret = rxkad_decrypt_response(conn, response, &session_key); if (ret < 0) - goto temporary_error_free_ticket; + goto error; if (ntohl(response->encrypted.epoch) != conn->proto.epoch || ntohl(response->encrypted.cid) != conn->proto.cid || ntohl(response->encrypted.securityIndex) != conn->security_ix) { - rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, - rxkad_abort_resp_bad_param); - goto protocol_error_free; + ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_param); + goto error; } csum = response->encrypted.checksum; response->encrypted.checksum = 0; rxkad_calc_response_checksum(response); if (response->encrypted.checksum != csum) { - 
rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, - rxkad_abort_resp_bad_checksum); - goto protocol_error_free; + ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_checksum); + goto error; } for (i = 0; i < RXRPC_MAXCALLS; i++) { @@ -1241,38 +1246,38 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, u32 counter = READ_ONCE(conn->channels[i].call_counter); if (call_id > INT_MAX) { - rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, - rxkad_abort_resp_bad_callid); - goto protocol_error_free; + ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_bad_callid); + goto error; } if (call_id < counter) { - rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, - rxkad_abort_resp_call_ctr); - goto protocol_error_free; + ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + rxkad_abort_resp_call_ctr); + goto error; } if (call_id > counter) { if (conn->channels[i].call) { - rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, + ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO, rxkad_abort_resp_call_state); - goto protocol_error_free; + goto error; } conn->channels[i].call_counter = call_id; } } if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) { - rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO, - rxkad_abort_resp_ooseq); - goto protocol_error_free; + ret = rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO, + rxkad_abort_resp_ooseq); + goto error; } level = ntohl(response->encrypted.level); if (level > RXRPC_SECURITY_ENCRYPT) { - rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EPROTO, - rxkad_abort_resp_level); - goto protocol_error_free; + ret = rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EPROTO, + rxkad_abort_resp_level); + goto error; } conn->security_level = level; @@ -1280,31 +1285,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, * this the connection security can be handled in exactly the same way * as for a client connection */ ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); - if (ret < 0) - goto temporary_error_free_ticket; - - kfree(ticket); - kfree(response); - _leave(" = 0"); - return 0; - -protocol_error_free: - kfree(ticket); -protocol_error: - kfree(response); - key_put(server_key); - return -EPROTO; -temporary_error_free_ticket: +error: kfree(ticket); -temporary_error_free_resp: kfree(response); -temporary_error: - /* Ignore the response packet if we got a temporary error such as - * ENOMEM. We just want to send the challenge again. Note that we - * also come out this way if the ticket decryption fails. - */ key_put(server_key); + _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 3bcd6ee80396..e2169d1a14b5 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -47,15 +47,6 @@ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) } /* - * Note the dropping of a ref on a socket buffer by the core. - */ -void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) -{ - int n = atomic_inc_return(&rxrpc_n_rx_skbs); - trace_rxrpc_skb(skb, 0, n, why); -} - -/* * Note the destruction of a socket buffer. 
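The rxkad_verify_response() rewrite a little further up is mostly mechanical, but its shape is worth noting: four exit ladders (protocol_error, protocol_error_free, temporary_error_free_ticket, temporary_error_free_resp) collapse into one error label. Two properties make that correct: ticket is now initialised to NULL, and kfree(NULL)/key_put(NULL) are safe no-ops, so a single unwind covers every partially built state; and rxrpc_abort_conn() returns the negative errno, so each failure site just assigns ret and jumps, which also closes the leaks the commit titles mention. The idiom in miniature, with hypothetical RESP_SZ, TKT_SZ and check():

	static int verify_sketch(void)
	{
		void *resp = NULL, *ticket = NULL;
		int ret = -ENOMEM;

		resp = kzalloc(RESP_SZ, GFP_NOFS);
		if (!resp)
			goto error;

		ticket = kmalloc(TKT_SZ, GFP_NOFS);
		if (!ticket)
			goto error;

		ret = check(resp, ticket);	/* may return e.g. -EPROTO */
	error:
		kfree(ticket);			/* kfree(NULL) is a no-op */
		kfree(resp);
		return ret;
	}
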
*/ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 05e0b14b5773..2c5a7a321a94 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -354,7 +354,7 @@ static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, goto assign_prev; tcf_mirred_to_dev(skb, m, dev_prev, - dev_is_mac_header_xmit(dev), + dev_is_mac_header_xmit(dev_prev), mirred_eaction, retval); assign_prev: dev_prev = dev; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ffea9fbd522d..02e1fa4577ae 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -619,7 +619,7 @@ static bool cake_update_flowkeys(struct flow_keys *keys, } port = rev ? tuple.src.u.all : tuple.dst.u.all; if (port != keys->ports.dst) { - port = keys->ports.dst; + keys->ports.dst = port; upd = true; } } diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index fe6f5e889625..241e6a46bd00 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -868,11 +868,35 @@ static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, old_backlog = sch->qstats.backlog; while (qdisc_qlen(sch) > sch->limit || q->memory_used > q->memory_limit) { - struct sk_buff *skb = qdisc_dequeue_internal(sch, true); + struct sk_buff *skb = NULL; - q->memory_used -= skb->truesize; - qdisc_qstats_backlog_dec(sch, skb); - rtnl_qdisc_drop(skb, sch); + if (qdisc_qlen(sch) > qdisc_qlen(q->l_queue)) { + skb = qdisc_dequeue_internal(sch, true); + if (unlikely(!skb)) { + WARN_ON_ONCE(1); + break; + } + q->memory_used -= skb->truesize; + rtnl_qdisc_drop(skb, sch); + } else if (qdisc_qlen(q->l_queue)) { + skb = qdisc_dequeue_internal(q->l_queue, true); + if (unlikely(!skb)) { + WARN_ON_ONCE(1); + break; + } + /* L-queue packets are counted in both sch and + * l_queue on enqueue; qdisc_dequeue_internal() + * handled l_queue, so we further account for sch. 
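The comment above is the crux of the dualpi2 fix: an L-queue packet is accounted in both the child l_queue and the parent sch at enqueue time, and qdisc_dequeue_internal() on the child unwinds only the child's counters. Dropping straight from the L queue therefore has to mirror the bookkeeping on the parent by hand, which is exactly what the lines that follow do. As a hedged helper covering just the parent-side half:

	static void l_drop_account_parent(struct Qdisc *sch, struct sk_buff *skb)
	{
		/* qdisc_dequeue_internal(l_queue, true) already fixed up
		 * the child; undo the duplicate enqueue-time accounting
		 * on the parent and charge the drop there too.
		 */
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_qstats_drop(sch);
	}
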
+ */ + --sch->q.qlen; + qdisc_qstats_backlog_dec(sch, skb); + q->memory_used -= skb->truesize; + rtnl_qdisc_drop(skb, q->l_queue); + qdisc_qstats_drop(sch); + } else { + WARN_ON_ONCE(1); + break; + } } qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), old_backlog - sch->qstats.backlog); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2a3d758f67ab..0664b2f2d6f2 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -585,6 +585,8 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) }; struct list_head *pos; + sch_tree_lock(sch); + st.qdisc_stats.maxpacket = q->cstats.maxpacket; st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; @@ -593,7 +595,6 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.memory_usage = q->memory_usage; st.qdisc_stats.drop_overmemory = q->drop_overmemory; - sch_tree_lock(sch); list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 95e5d9bfd9c8..96021f52d835 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -198,7 +198,8 @@ static struct hh_flow_state *seek_list(const u32 hash, return NULL; list_del(&flow->flowchain); kfree(flow); - q->hh_flows_current_cnt--; + WRITE_ONCE(q->hh_flows_current_cnt, + q->hh_flows_current_cnt - 1); } else if (flow->hash_id == hash) { return flow; } @@ -226,7 +227,7 @@ static struct hh_flow_state *alloc_new_hh(struct list_head *head, } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { - q->hh_flows_overlimit++; + WRITE_ONCE(q->hh_flows_overlimit, q->hh_flows_overlimit + 1); return NULL; } /* Create new entry. */ @@ -234,7 +235,7 @@ static struct hh_flow_state *alloc_new_hh(struct list_head *head, if (!flow) return NULL; - q->hh_flows_current_cnt++; + WRITE_ONCE(q->hh_flows_current_cnt, q->hh_flows_current_cnt + 1); INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); @@ -309,7 +310,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; - q->hh_flows_total_cnt++; + WRITE_ONCE(q->hh_flows_total_cnt, q->hh_flows_total_cnt + 1); /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). @@ -403,7 +404,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; - q->drop_overlimit++; + WRITE_ONCE(q->drop_overlimit, q->drop_overlimit + 1); /* Return Congestion Notification only if we dropped a packet from this * bucket. 
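The fq_codel change above is easy to misread as a no-op: sch_tree_lock() used to be taken only around the flow-list walk, so the scalar stats copied before it could be inconsistent with a concurrent change or reset; hoisting the lock above every read yields one coherent snapshot. The hhf hunks, including hhf_dump_stats() just below, take the other route: unlocked, but with every counter access annotated. The locked variant for a hypothetical qdisc "foo":

	static int foo_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
	{
		struct foo_sched_data *q = qdisc_priv(sch); /* hypothetical */
		struct tc_foo_xstats st = {};		    /* hypothetical */

		sch_tree_lock(sch);	/* take the lock before ANY read... */
		st.drops = q->drops;	/* ...so all fields form one snapshot */
		sch_tree_unlock(sch);

		return gnet_stats_copy_app(d, &st, sizeof(st));
	}
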
*/ @@ -686,10 +687,10 @@ static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { - .drop_overlimit = q->drop_overlimit, - .hh_overlimit = q->hh_flows_overlimit, - .hh_tot_count = q->hh_flows_total_cnt, - .hh_cur_count = q->hh_flows_current_cnt, + .drop_overlimit = READ_ONCE(q->drop_overlimit), + .hh_overlimit = READ_ONCE(q->hh_flows_overlimit), + .hh_tot_count = READ_ONCE(q->hh_flows_total_cnt), + .hh_cur_count = READ_ONCE(q->hh_flows_current_cnt), }; return gnet_stats_copy_app(d, &st, sizeof(st)); diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 16f3f629cb8e..fb53fbf0e328 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -90,7 +90,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, bool enqueue = false; if (unlikely(qdisc_qlen(sch) >= sch->limit)) { - q->stats.overlimit++; + WRITE_ONCE(q->stats.overlimit, q->stats.overlimit + 1); goto out; } @@ -104,7 +104,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* If packet is ecn capable, mark it if drop probability * is lower than 10%, else drop it. */ - q->stats.ecn_mark++; + WRITE_ONCE(q->stats.ecn_mark, q->stats.ecn_mark + 1); enqueue = true; } @@ -114,15 +114,15 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (!q->params.dq_rate_estimator) pie_set_enqueue_time(skb); - q->stats.packets_in++; + WRITE_ONCE(q->stats.packets_in, q->stats.packets_in + 1); if (qdisc_qlen(sch) > q->stats.maxq) - q->stats.maxq = qdisc_qlen(sch); + WRITE_ONCE(q->stats.maxq, qdisc_qlen(sch)); return qdisc_enqueue_tail(skb, sch); } out: - q->stats.dropped++; + WRITE_ONCE(q->stats.dropped, q->stats.dropped + 1); q->vars.accu_prob = 0; return qdisc_drop_reason(skb, sch, to_free, reason); } @@ -267,11 +267,11 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, count = count / dtime; if (vars->avg_dq_rate == 0) - vars->avg_dq_rate = count; + WRITE_ONCE(vars->avg_dq_rate, count); else - vars->avg_dq_rate = + WRITE_ONCE(vars->avg_dq_rate, (vars->avg_dq_rate - - (vars->avg_dq_rate >> 3)) + (count >> 3); + (vars->avg_dq_rate >> 3)) + (count >> 3)); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset @@ -381,7 +381,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (delta > 0) { /* prevent overflow */ if (vars->prob < oldprob) { - vars->prob = MAX_PROB; + WRITE_ONCE(vars->prob, MAX_PROB); /* Prevent normalization error. 
If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next @@ -392,7 +392,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, } else { /* prevent underflow */ if (vars->prob > oldprob) - vars->prob = 0; + WRITE_ONCE(vars->prob, 0); } /* Non-linear drop in probability: Reduce drop probability quickly if @@ -403,7 +403,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, /* Reduce drop probability to 98.4% */ vars->prob -= vars->prob / 64; - vars->qdelay = qdelay; + WRITE_ONCE(vars->qdelay, qdelay); vars->backlog_old = backlog; /* We restart the measurement cycle if the following conditions are met @@ -502,21 +502,21 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { .prob = q->vars.prob << BITS_PER_BYTE, - .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / + .delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) / NSEC_PER_USEC, - .packets_in = q->stats.packets_in, - .overlimit = q->stats.overlimit, - .maxq = q->stats.maxq, - .dropped = q->stats.dropped, - .ecn_mark = q->stats.ecn_mark, + .packets_in = READ_ONCE(q->stats.packets_in), + .overlimit = READ_ONCE(q->stats.overlimit), + .maxq = READ_ONCE(q->stats.maxq), + .dropped = READ_ONCE(q->stats.dropped), + .ecn_mark = READ_ONCE(q->stats.ecn_mark), }; /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ st.dq_rate_estimating = q->params.dq_rate_estimator; /* unscale and return dq_rate in bytes per sec */ - if (q->params.dq_rate_estimator) - st.avg_dq_rate = q->vars.avg_dq_rate * + if (st.dq_rate_estimating) + st.avg_dq_rate = READ_ONCE(q->vars.avg_dq_rate) * (PSCHED_TICKS_PER_SEC) >> PIE_SCALE; return gnet_stats_copy_app(d, &st, sizeof(st)); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index c8d3d09f15e3..432b8a3000a5 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -90,17 +90,20 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_PROB_MARK: qdisc_qstats_overlimit(sch); if (!red_use_ecn(q)) { - q->stats.prob_drop++; + WRITE_ONCE(q->stats.prob_drop, + q->stats.prob_drop + 1); goto congestion_drop; } if (INET_ECN_set_ce(skb)) { - q->stats.prob_mark++; + WRITE_ONCE(q->stats.prob_mark, + q->stats.prob_mark + 1); skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret); if (!skb) return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { - q->stats.prob_drop++; + WRITE_ONCE(q->stats.prob_drop, + q->stats.prob_drop + 1); goto congestion_drop; } @@ -111,17 +114,20 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, reason = QDISC_DROP_OVERLIMIT; qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q)) { - q->stats.forced_drop++; + WRITE_ONCE(q->stats.forced_drop, + q->stats.forced_drop + 1); goto congestion_drop; } if (INET_ECN_set_ce(skb)) { - q->stats.forced_mark++; + WRITE_ONCE(q->stats.forced_mark, + q->stats.forced_mark + 1); skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret); if (!skb) return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { - q->stats.forced_drop++; + WRITE_ONCE(q->stats.forced_drop, + q->stats.forced_drop + 1); goto congestion_drop; } @@ -135,7 +141,8 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, sch->qstats.backlog += len; sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { - q->stats.pdrop++; + WRITE_ONCE(q->stats.pdrop, + q->stats.pdrop + 1); qdisc_qstats_drop(sch); } 
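
Both RED branches annotated above follow the same decision ladder: try ECN marking first, and fall back to dropping when ECN is off, when CE marking fails on a non-ECT packet, or when nodrop mode is not set. A compact userspace model of that ladder; the flag names mimic the red_use_*() helpers but are plain booleans here:

/* red_decision.c - toy model of RED's mark-vs-drop ladder. */
#include <stdbool.h>
#include <stdio.h>

enum red_verdict { RED_MARK, RED_DROP, RED_PASS };

/* use_ecn/use_nodrop stand in for red_use_ecn()/red_use_nodrop();
 * set_ce_ok stands in for INET_ECN_set_ce() succeeding.
 */
static enum red_verdict red_decide(bool use_ecn, bool set_ce_ok,
				   bool use_nodrop)
{
	if (!use_ecn)
		return RED_DROP;	/* stats.prob_drop++ path */
	if (set_ce_ok)
		return RED_MARK;	/* stats.prob_mark++ path */
	if (!use_nodrop)
		return RED_DROP;	/* non-ECT packet, nodrop off */
	return RED_PASS;		/* nodrop: enqueue unmarked */
}

int main(void)
{
	printf("%d %d %d\n",
	       red_decide(false, false, false),	/* 1: DROP */
	       red_decide(true, true, false),	/* 0: MARK */
	       red_decide(true, false, true));	/* 2: PASS */
	return 0;
}
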
return ret; @@ -463,9 +470,13 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED, &hw_stats_request); } - st.early = q->stats.prob_drop + q->stats.forced_drop; - st.pdrop = q->stats.pdrop; - st.marked = q->stats.prob_mark + q->stats.forced_mark; + st.early = READ_ONCE(q->stats.prob_drop) + + READ_ONCE(q->stats.forced_drop); + + st.pdrop = READ_ONCE(q->stats.pdrop); + + st.marked = READ_ONCE(q->stats.prob_mark) + + READ_ONCE(q->stats.forced_mark); return gnet_stats_copy_app(d, &st, sizeof(st)); } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 013738662128..bd5ef561030f 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -130,7 +130,7 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen < 0xFFFF) - b[hash].qlen++; + WRITE_ONCE(b[hash].qlen, b[hash].qlen + 1); b += SFB_NUMBUCKETS; /* next level */ } } @@ -159,7 +159,7 @@ static void decrement_one_qlen(u32 sfbhash, u32 slot, sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen > 0) - b[hash].qlen--; + WRITE_ONCE(b[hash].qlen, b[hash].qlen - 1); b += SFB_NUMBUCKETS; /* next level */ } } @@ -179,12 +179,12 @@ static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { - b->p_mark = prob_minus(b->p_mark, q->decrement); + WRITE_ONCE(b->p_mark, prob_minus(b->p_mark, q->decrement)); } static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { - b->p_mark = prob_plus(b->p_mark, q->increment); + WRITE_ONCE(b->p_mark, prob_plus(b->p_mark, q->increment)); } static void sfb_zero_all_buckets(struct sfb_sched_data *q) @@ -202,11 +202,14 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { - if (qlen < b->qlen) - qlen = b->qlen; - totalpm += b->p_mark; - if (prob < b->p_mark) - prob = b->p_mark; + u32 b_qlen = READ_ONCE(b->qlen); + u32 b_mark = READ_ONCE(b->p_mark); + + if (qlen < b_qlen) + qlen = b_qlen; + totalpm += b_mark; + if (prob < b_mark) + prob = b_mark; b++; } *prob_r = prob; @@ -295,7 +298,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); - q->stats.queuedrop++; + WRITE_ONCE(q->stats.queuedrop, + q->stats.queuedrop + 1); goto drop; } @@ -348,7 +352,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(minqlen >= q->max)) { qdisc_qstats_overlimit(sch); - q->stats.bucketdrop++; + WRITE_ONCE(q->stats.bucketdrop, + q->stats.bucketdrop + 1); goto drop; } @@ -374,7 +379,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (sfb_rate_limit(skb, q)) { qdisc_qstats_overlimit(sch); - q->stats.penaltydrop++; + WRITE_ONCE(q->stats.penaltydrop, + q->stats.penaltydrop + 1); goto drop; } goto enqueue; @@ -390,14 +396,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, * In either case, we want to start dropping packets. 
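
The p_mark updates annotated in this file (increment_prob()/decrement_prob()) are saturating: the per-bucket marking probability is clamped to [0, SFB_MAX_PROB] rather than allowed to wrap. A self-contained sketch of that clamped arithmetic; the ceiling value and helper bodies are illustrative stand-ins for the kernel's prob_plus()/prob_minus():

/* sfb_prob.c - saturating probability counters, SFB-style. */
#include <stdio.h>

#define SFB_MAX_PROB 0xFFFF	/* illustrative ceiling */

static unsigned int prob_plus(unsigned int p, unsigned int inc)
{
	return p + inc <= SFB_MAX_PROB ? p + inc : SFB_MAX_PROB;
}

static unsigned int prob_minus(unsigned int p, unsigned int dec)
{
	return p > dec ? p - dec : 0;
}

int main(void)
{
	unsigned int p = 0;

	p = prob_plus(p, SFB_MAX_PROB - 10);
	p = prob_plus(p, 100);			/* saturates at ceiling */
	printf("%#x\n", p);			/* 0xffff */
	p = prob_minus(p, SFB_MAX_PROB + 1);
	printf("%u\n", p);			/* 0, no underflow */
	return 0;
}
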
*/ if (r < (p_min - SFB_MAX_PROB / 2) * 2) { - q->stats.earlydrop++; + WRITE_ONCE(q->stats.earlydrop, + q->stats.earlydrop + 1); goto drop; } } if (INET_ECN_set_ce(skb)) { - q->stats.marked++; + WRITE_ONCE(q->stats.marked, + q->stats.marked + 1); } else { - q->stats.earlydrop++; + WRITE_ONCE(q->stats.earlydrop, + q->stats.earlydrop + 1); goto drop; } } @@ -410,7 +419,8 @@ enqueue: sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { - q->stats.childdrop++; + WRITE_ONCE(q->stats.childdrop, + q->stats.childdrop + 1); qdisc_qstats_drop(sch); } return ret; @@ -599,12 +609,12 @@ static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct sfb_sched_data *q = qdisc_priv(sch); struct tc_sfb_xstats st = { - .earlydrop = q->stats.earlydrop, - .penaltydrop = q->stats.penaltydrop, - .bucketdrop = q->stats.bucketdrop, - .queuedrop = q->stats.queuedrop, - .childdrop = q->stats.childdrop, - .marked = q->stats.marked, + .earlydrop = READ_ONCE(q->stats.earlydrop), + .penaltydrop = READ_ONCE(q->stats.penaltydrop), + .bucketdrop = READ_ONCE(q->stats.bucketdrop), + .queuedrop = READ_ONCE(q->stats.queuedrop), + .childdrop = READ_ONCE(q->stats.childdrop), + .marked = READ_ONCE(q->stats.marked), }; st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8e3752811950..a47a09d76400 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -972,11 +972,12 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) } if (should_change_schedules(admin, oper, end_time)) { - /* Set things so the next time this runs, the new - * schedule runs. - */ - end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); + /* After changing schedules, the next entry is the first one + * in the new schedule, with a pre-calculated end_time. 
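
list_first_entry(), used just below to pick up the new schedule's first entry, is the standard container_of-based accessor for an intrusive list: it returns the struct embedding the first list node. A freestanding sketch of the mechanism, with toy macros rather than the kernel headers:

/* first_entry.c - toy container_of/list_first_entry mechanics. */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_first_entry(head, type, member) \
	container_of((head)->next, type, member)

struct sched_entry {		/* toy stand-in for taprio's entry */
	long end_time;
	struct list_head list;
};

int main(void)
{
	struct list_head entries;
	struct sched_entry first = { .end_time = 42 };

	/* one-element circular list: head <-> first */
	entries.next = &first.list;
	entries.prev = &first.list;
	first.list.next = &entries;
	first.list.prev = &entries;

	struct sched_entry *e =
		list_first_entry(&entries, struct sched_entry, list);
	printf("%ld\n", e->end_time);	/* 42 */
	return 0;
}
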
+ */ + next = list_first_entry(&oper->entries, struct sched_entry, list); + end_time = next->end_time; } next->end_time = end_time; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d2665bbd41a2..58d0d9747f0b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4855,8 +4855,9 @@ static struct sock *sctp_clone_sock(struct sock *sk, if (!newsk) return ERR_PTR(err); - /* sk_clone() sets refcnt to 2 */ + /* sk_clone() sets refcnt to 2 and increments sockets_allocated */ sock_put(newsk); + sk_sockets_allocated_dec(newsk); newinet = inet_sk(newsk); newsp = sctp_sk(newsk); @@ -7033,7 +7034,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, /* See if the user provided enough room for all the data */ num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr); - if (len < num_chunks) + if (len < sizeof(struct sctp_authchunks) + num_chunks) return -EINVAL; if (copy_to_user(to, ch->chunks, num_chunks)) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index c38fc7bf0a7e..014d527d5462 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -788,8 +788,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, dclc = (struct smc_clc_msg_decline *)clcm; reason_code = SMC_CLC_DECL_PEERDECL; smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); - if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 & - SMC_FIRST_CONTACT_MASK) { + if ((dclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK) && + smc->conn.lgr) { smc->conn.lgr->sync_err = 1; smc_lgr_terminate_sched(smc->conn.lgr); } diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 76284fc538eb..b0bba0feef56 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -177,8 +177,20 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == LAST_FRAGMENT) { TIPC_SKB_CB(head)->validated = 0; - if (unlikely(!tipc_msg_validate(&head))) + + /* If the reassembled skb has been freed in + * tipc_msg_validate() because of an invalid truesize, + * then head will point to a newly allocated reassembled + * skb, while *headbuf points to freed reassembled skb. + * In such cases, correct *headbuf for freeing the newly + * allocated reassembled skb later. + */ + if (unlikely(!tipc_msg_validate(&head))) { + if (head != *headbuf) + *headbuf = head; goto err; + } + *buf = head; TIPC_SKB_CB(head)->tail = NULL; *headbuf = NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f668ff107722..e2d787ca3e74 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1968,16 +1968,19 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_destruct_scm(struct sk_buff *skb) { - struct scm_cookie scm; + struct scm_cookie scm = {}; + + swap(scm.pid, UNIXCB(skb).pid); - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); - /* Alas, it calls VFS */ - /* So fscking what? 
fput() had been SMP-safe since the last Summer */ scm_destroy(&scm); +} + +static void unix_wfree(struct sk_buff *skb) +{ + unix_destruct_scm(skb); sock_wfree(skb); } @@ -1993,7 +1996,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen if (scm->fp && send_fds) err = unix_attach_fds(scm, skb); - skb->destructor = unix_destruct_scm; + skb->destructor = unix_wfree; return err; } @@ -2070,6 +2073,13 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) } } +static void unix_orphan_scm(struct sock *sk, struct sk_buff *skb) +{ + scm_stat_del(sk, skb); + unix_destruct_scm(skb); + skb->destructor = sock_wfree; +} + /* * Send AF_UNIX data. */ @@ -2683,10 +2693,16 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) int err; mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) + if (!skb) { + mutex_unlock(&u->iolock); return err; + } + + unix_orphan_scm(sk, skb); + + mutex_unlock(&u->iolock); return recv_actor(sk, skb); } @@ -2886,6 +2902,9 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) #endif spin_unlock(&queue->lock); + + unix_orphan_scm(sk, skb); + mutex_unlock(&u->iolock); return recv_actor(sk, skb); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 2b7c0b5896ed..f862988c1e86 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -694,7 +694,6 @@ out: static s64 hvs_stream_has_data(struct vsock_sock *vsk) { struct hvsock *hvs = vsk->trans; - bool need_refill; s64 ret; if (hvs->recv_data_len > 0) @@ -702,9 +701,31 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk) switch (hvs_channel_readable_payload(hvs->chan)) { case 1: - need_refill = !hvs->recv_desc; - if (!need_refill) - return -EIO; + if (hvs->recv_desc) { + /* Here hvs->recv_data_len is 0, so hvs->recv_desc must + * be NULL unless it points to the 0-byte-payload FIN + * packet or a malformed/short packet: see + * hvs_update_recv_data(). + * + * If hvs->recv_desc points to the FIN packet, here all + * the payload has been dequeued and the peer_shutdown + * flag is set, but hvs_channel_readable_payload() still + * returns 1, because the VMBus ringbuffer's read_index + * is not updated for the FIN packet: + * hvs_stream_dequeue() -> hv_pkt_iter_next() updates + * the cached priv_read_index but has no opportunity to + * update the read_index in hv_pkt_iter_close() as + * hvs_stream_has_data() returns 0 for the FIN packet, + * so it won't get dequeued. + * + * In case hvs->recv_desc points to a malformed/short + * packet, return -EIO. 
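
The net effect of the branch below is a two-way decision on a payload-less descriptor: if the peer has already signalled SEND_SHUTDOWN, the descriptor is the FIN and the socket should report clean EOF (0 bytes readable); otherwise the packet is malformed and the error is surfaced. A minimal model of that decision, with the flag value assumed here for illustration:

/* hvs_eof.c - toy model of the FIN-vs-malformed decision. */
#include <stdio.h>

#define SEND_SHUTDOWN 2		/* assumed; mirrors the kernel flag */
#define EIO 5

/* Returns bytes readable, 0 for EOF, or -EIO for a bad packet. */
static long has_data(long recv_data_len, int have_desc,
		     unsigned int peer_shutdown)
{
	if (recv_data_len > 0)
		return recv_data_len;
	if (have_desc) {
		if (!(peer_shutdown & SEND_SHUTDOWN))
			return -EIO;	/* short/malformed packet */
		return 0;		/* 0-byte FIN: clean EOF */
	}
	return 0;			/* nothing queued yet */
}

int main(void)
{
	printf("%ld\n", has_data(0, 1, SEND_SHUTDOWN));	/* 0: EOF */
	printf("%ld\n", has_data(0, 1, 0));		/* -5: -EIO */
	return 0;
}

This is what turns the old spurious -EIO on FIN into the EOF userspace expects, matching the hv_sock entry in the merge summary.
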
+ */ + if (!(vsk->peer_shutdown & SEND_SHUTDOWN)) + return -EIO; + + return 0; + } hvs->recv_desc = hv_pkt_iter_first(hvs->chan); if (!hvs->recv_desc) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a152a9e208d0..416d533f493d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -73,6 +73,7 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, struct sk_buff *skb, struct msghdr *msg, + size_t pkt_len, bool zerocopy) { struct ubuf_info *uarg; @@ -81,12 +82,10 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = msg->msg_ubuf; net_zcopy_get(uarg); } else { - struct iov_iter *iter = &msg->msg_iter; struct ubuf_info_msgzc *uarg_zc; uarg = msg_zerocopy_realloc(sk_vsock(vsk), - iter->count, - NULL, false); + pkt_len, NULL, false); if (!uarg) return -1; @@ -398,11 +397,17 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, * each iteration. If this is last skb for this buffer * and MSG_ZEROCOPY mode is in use - we must allocate * completion for the current syscall. + * + * Pass pkt_len because msg iter is already consumed + * by virtio_transport_fill_skb(), so iter->count + * can not be used for RLIMIT_MEMLOCK pinned-pages + * accounting done by msg_zerocopy_realloc(). */ if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY && skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) { if (virtio_transport_init_zcopy_skb(vsk, skb, info->msg, + pkt_len, can_zcopy)) { kfree_skb(skb); ret = -ENOMEM; @@ -545,9 +550,8 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, skb_queue_walk(&vvs->rx_queue, skb) { size_t bytes; - bytes = len - total; - if (bytes > skb->len) - bytes = skb->len; + bytes = min_t(size_t, len - total, + skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset); spin_unlock_bh(&vvs->rx_lock); @@ -1558,8 +1562,6 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, return -ENOMEM; } - sk_acceptq_added(sk); - lock_sock_nested(child, SINGLE_DEPTH_NESTING); child->sk_state = TCP_ESTABLISHED; @@ -1581,6 +1583,7 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, return ret; } + sk_acceptq_added(sk); if (virtio_transport_space_update(child, skb)) child->sk_write_space(child); |
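
The MSG_PEEK fix in the virtio hunk above boils down to one piece of arithmetic: the peekable span of an skb is its length minus the offset already consumed by earlier non-peek reads, clamped by the space the caller has left. A standalone sketch of that computation; the parameter names are illustrative, not the kernel's:

/* peek_len.c - copy-length math from the MSG_PEEK fix above. */
#include <stdio.h>

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* len: bytes the caller wants; total: bytes already peeked this
 * call; skb_len: payload in the buffer; off: payload consumed by
 * prior dequeues (VIRTIO_VSOCK_SKB_CB(skb)->offset in the patch).
 */
static size_t peek_bytes(size_t len, size_t total,
			 size_t skb_len, size_t off)
{
	return min_sz(len - total, skb_len - off);
}

int main(void)
{
	/* 100-byte skb, 40 already read: only 60 are peekable,
	 * even though the caller asked for 80.
	 */
	printf("%zu\n", peek_bytes(80, 0, 100, 40));	/* 60 */
	/* the old code clamped against skb_len alone and would
	 * have reported 80, re-reading already-consumed payload.
	 */
	return 0;
}
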
