author    Linus Torvalds <torvalds@linux-foundation.org>  2026-04-23 16:50:42 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2026-04-23 16:50:42 -0700
commit    e728258debd553c95d2e70f9cd97c9fde27c7130 (patch)
tree      18ef97c80f9923717f5cf6bdab44d77607ca0f4b /net
parent    e8df5a0c0d041588e7f02781822d637d226cdbe8 (diff)
parent    5e6391da4539c35422c0df1d1d2d9a9bb97cd736 (diff)
Merge tag 'net-7.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Jakub Kicinski:
 "Including fixes from Netfilter.

  Steady stream of fixes. Last two weeks feel comparable to the two
  weeks before the merge window. Lots of AI-aided bug discovery. A
  newer big source is Sashiko/Gemini (Roman Gushchin's system), which
  points out issues in existing code during patch review (maybe 25% of
  fixes here likely originating from Sashiko). Nice thing is these are
  often fixed by the respective maintainers, not drive-bys.

  Current release - new code bugs:

   - kconfig: MDIO_PIC64HPSC should depend on ARCH_MICROCHIP

  Previous releases - regressions:

   - add async ndo_set_rx_mode and switch drivers which we promised to
     be called under the per-netdev mutex to it

   - dsa: remove duplicate netdev_lock_ops() for conduit ethtool ops

   - hv_sock: report EOF instead of -EIO for FIN

   - vsock/virtio: fix MSG_PEEK calculation on bytes to copy

  Previous releases - always broken:

   - ipv6: fix possible UAF in icmpv6_rcv()

   - icmp: validate reply type before using icmp_pointers

   - af_unix: drop all SCM attributes for SOCKMAP

   - netfilter: fix a number of bugs in the osf (OS fingerprinting)

   - eth: intel: fix timestamp interrupt configuration for E825C

  Misc:

   - bunch of data-race annotations"

* tag 'net-7.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (148 commits)
  rxrpc: Fix error handling in rxgk_extract_token()
  rxrpc: Fix re-decryption of RESPONSE packets
  rxrpc: Fix rxrpc_input_call_event() to only unshare DATA packets
  rxrpc: Fix missing validation of ticket length in non-XDR key preparsing
  rxgk: Fix potential integer overflow in length check
  rxrpc: Fix conn-level packet handling to unshare RESPONSE packets
  rxrpc: Fix potential UAF after skb_unshare() failure
  rxrpc: Fix rxkad crypto unalignment handling
  rxrpc: Fix memory leaks in rxkad_verify_response()
  net: rds: fix MR cleanup on copy error
  m68k: mvme147: Make me the maintainer
  net: txgbe: fix firmware version check
  selftests/bpf: check epoll readiness during reuseport migration
  tcp: call sk_data_ready() after listener migration
  vhost_net: fix sleeping with preempt-disabled in vhost_net_busy_poll()
  ipv6: Cap TLV scan in ip6_tnl_parse_tlv_enc_lim
  tipc: fix double-free in tipc_buf_append()
  llc: Return -EINPROGRESS from llc_ui_connect()
  ipv4: icmp: validate reply type before using icmp_pointers
  selftests/net: packetdrill: cover RFC 5961 5.2 challenge ACK on both edges
  ...
Diffstat (limited to 'net')
-rw-r--r-- net/8021q/vlan_dev.c                    |  45
-rw-r--r-- net/8021q/vlan_netlink.c                |  10
-rw-r--r-- net/8021q/vlanproc.c                    |  12
-rw-r--r-- net/bridge/br_arp_nd_proxy.c            |   8
-rw-r--r-- net/bridge/br_fdb.c                     |  28
-rw-r--r-- net/core/dev.c                          |  67
-rw-r--r-- net/core/dev.h                          |   4
-rw-r--r-- net/core/dev_addr_lists.c               | 385
-rw-r--r-- net/core/dev_addr_lists_test.c          | 387
-rw-r--r-- net/core/dev_api.c                      |   3
-rw-r--r-- net/core/dev_ioctl.c                    |   6
-rw-r--r-- net/core/filter.c                       |   2
-rw-r--r-- net/core/flow_dissector.c               |  13
-rw-r--r-- net/core/rtnetlink.c                    |   1
-rw-r--r-- net/dsa/conduit.c                       |  16
-rw-r--r-- net/ipv4/icmp.c                         |   5
-rw-r--r-- net/ipv4/inet_connection_sock.c         |   3
-rw-r--r-- net/ipv4/netfilter/iptable_nat.c        |   4
-rw-r--r-- net/ipv4/nexthop.c                      |   4
-rw-r--r-- net/ipv4/tcp.c                          |  64
-rw-r--r-- net/ipv4/tcp_bbr.c                      |   6
-rw-r--r-- net/ipv4/tcp_bic.c                      |   2
-rw-r--r-- net/ipv4/tcp_cdg.c                      |   4
-rw-r--r-- net/ipv4/tcp_cubic.c                    |   6
-rw-r--r-- net/ipv4/tcp_dctcp.c                    |   2
-rw-r--r-- net/ipv4/tcp_input.c                    |  52
-rw-r--r-- net/ipv4/tcp_metrics.c                  |   6
-rw-r--r-- net/ipv4/tcp_nv.c                       |   4
-rw-r--r-- net/ipv4/tcp_output.c                   |  19
-rw-r--r-- net/ipv4/tcp_plb.c                      |   2
-rw-r--r-- net/ipv4/tcp_timer.c                    |   2
-rw-r--r-- net/ipv4/tcp_vegas.c                    |   9
-rw-r--r-- net/ipv4/tcp_westwood.c                 |   4
-rw-r--r-- net/ipv4/tcp_yeah.c                     |   3
-rw-r--r-- net/ipv6/icmp.c                         |  10
-rw-r--r-- net/ipv6/ip6_tunnel.c                   |   6
-rw-r--r-- net/ipv6/netfilter/ip6table_nat.c       |   4
-rw-r--r-- net/ipv6/seg6_iptunnel.c                |   3
-rw-r--r-- net/llc/af_llc.c                        |   4
-rw-r--r-- net/mctp/route.c                        |   8
-rw-r--r-- net/mptcp/protocol.c                    |   2
-rw-r--r-- net/netfilter/ipvs/ip_vs_xmit.c         |  19
-rw-r--r-- net/netfilter/nf_nat_amanda.c           |   2
-rw-r--r-- net/netfilter/nf_nat_core.c             |  10
-rw-r--r-- net/netfilter/nf_nat_sip.c              |  33
-rw-r--r-- net/netfilter/nfnetlink_osf.c           |  45
-rw-r--r-- net/netfilter/nft_osf.c                 |   6
-rw-r--r-- net/netfilter/xt_mac.c                  |  34
-rw-r--r-- net/netfilter/xt_owner.c                |  37
-rw-r--r-- net/netfilter/xt_physdev.c              |  29
-rw-r--r-- net/netfilter/xt_realm.c                |   2
-rw-r--r-- net/openvswitch/datapath.c              |  35
-rw-r--r-- net/openvswitch/vport.c                 |   3
-rw-r--r-- net/packet/af_packet.c                  |  21
-rw-r--r-- net/rds/connection.c                    |  14
-rw-r--r-- net/rds/rdma.c                          |   4
-rw-r--r-- net/rxrpc/ar-internal.h                 |   1
-rw-r--r-- net/rxrpc/call_event.c                  |  20
-rw-r--r-- net/rxrpc/conn_event.c                  |  43
-rw-r--r-- net/rxrpc/io_thread.c                   |  24
-rw-r--r-- net/rxrpc/key.c                         |   4
-rw-r--r-- net/rxrpc/rxgk_app.c                    |   3
-rw-r--r-- net/rxrpc/rxgk_common.h                 |   1
-rw-r--r-- net/rxrpc/rxkad.c                       | 112
-rw-r--r-- net/rxrpc/skbuff.c                      |   9
-rw-r--r-- net/sched/act_mirred.c                  |   2
-rw-r--r-- net/sched/sch_cake.c                    |   2
-rw-r--r-- net/sched/sch_dualpi2.c                 |  32
-rw-r--r-- net/sched/sch_fq_codel.c                |   3
-rw-r--r-- net/sched/sch_hhf.c                     |  19
-rw-r--r-- net/sched/sch_pie.c                     |  38
-rw-r--r-- net/sched/sch_red.c                     |  31
-rw-r--r-- net/sched/sch_sfb.c                     |  54
-rw-r--r-- net/sched/sch_taprio.c                  |   9
-rw-r--r-- net/sctp/socket.c                       |   5
-rw-r--r-- net/smc/smc_clc.c                       |   4
-rw-r--r-- net/tipc/msg.c                          |  14
-rw-r--r-- net/unix/af_unix.c                      |  35
-rw-r--r-- net/vmw_vsock/hyperv_transport.c        |  29
-rw-r--r-- net/vmw_vsock/virtio_transport_common.c |  19
80 files changed, 1505 insertions, 527 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index c40f7d5c4fca..7aa3af8b10ea 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -172,39 +172,42 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
u32 skb_prio, u16 vlan_prio)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
- struct vlan_priority_tci_mapping *mp = NULL;
+ struct vlan_priority_tci_mapping __rcu **mpp;
+ struct vlan_priority_tci_mapping *mp;
struct vlan_priority_tci_mapping *np;
+ u32 bucket = skb_prio & 0xF;
u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
/* See if a priority mapping exists.. */
- mp = vlan->egress_priority_map[skb_prio & 0xF];
+ mpp = &vlan->egress_priority_map[bucket];
+ mp = rtnl_dereference(*mpp);
while (mp) {
if (mp->priority == skb_prio) {
- if (mp->vlan_qos && !vlan_qos)
+ if (!vlan_qos) {
+ rcu_assign_pointer(*mpp, rtnl_dereference(mp->next));
vlan->nr_egress_mappings--;
- else if (!mp->vlan_qos && vlan_qos)
- vlan->nr_egress_mappings++;
- mp->vlan_qos = vlan_qos;
+ kfree_rcu(mp, rcu);
+ } else {
+ WRITE_ONCE(mp->vlan_qos, vlan_qos);
+ }
return 0;
}
- mp = mp->next;
+ mpp = &mp->next;
+ mp = rtnl_dereference(*mpp);
}
/* Create a new mapping then. */
- mp = vlan->egress_priority_map[skb_prio & 0xF];
+ if (!vlan_qos)
+ return 0;
+
np = kmalloc_obj(struct vlan_priority_tci_mapping);
if (!np)
return -ENOBUFS;
- np->next = mp;
np->priority = skb_prio;
np->vlan_qos = vlan_qos;
- /* Before inserting this element in hash table, make sure all its fields
- * are committed to memory.
- * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask()
- */
- smp_wmb();
- vlan->egress_priority_map[skb_prio & 0xF] = np;
+ RCU_INIT_POINTER(np->next, rtnl_dereference(vlan->egress_priority_map[bucket]));
+ rcu_assign_pointer(vlan->egress_priority_map[bucket], np);
if (vlan_qos)
vlan->nr_egress_mappings++;
return 0;
@@ -604,11 +607,17 @@ void vlan_dev_free_egress_priority(const struct net_device *dev)
int i;
for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
- while ((pm = vlan->egress_priority_map[i]) != NULL) {
- vlan->egress_priority_map[i] = pm->next;
- kfree(pm);
+ pm = rtnl_dereference(vlan->egress_priority_map[i]);
+ RCU_INIT_POINTER(vlan->egress_priority_map[i], NULL);
+ while (pm) {
+ struct vlan_priority_tci_mapping *next;
+
+ next = rtnl_dereference(pm->next);
+ kfree_rcu(pm, rcu);
+ pm = next;
}
}
+ vlan->nr_egress_mappings = 0;
}
static void vlan_dev_uninit(struct net_device *dev)
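
The vlan_dev.c hunks above convert the egress priority map from a plain
pointer list guarded by smp_wmb()/smp_rmb() to proper RCU publication:
rcu_assign_pointer() on insert, kfree_rcu() on delete, WRITE_ONCE() on
in-place qos updates. The transmit-path reader is not part of this hunk;
a minimal sketch of what a lookup looks like under the new rules (helper
name chosen for illustration, fields follow the hunk):

static u32 vlan_egress_qos_lookup(const struct net_device *dev, u32 skb_prio)
{
	const struct vlan_priority_tci_mapping *mp;

	/* Pairs with rcu_assign_pointer() in vlan_dev_set_egress_priority();
	 * caller must be inside an RCU read-side critical section.
	 */
	mp = rcu_dereference(vlan_dev_priv(dev)->egress_priority_map[skb_prio & 0xF]);
	while (mp) {
		if (mp->priority == skb_prio)
			/* Pairs with WRITE_ONCE(mp->vlan_qos, ...) above */
			return READ_ONCE(mp->vlan_qos);
		mp = rcu_dereference(mp->next);
	}
	return 0;
}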
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index a000b1ef0520..368d53ca7d87 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -260,13 +260,11 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
- for (pm = vlan->egress_priority_map[i]; pm;
- pm = pm->next) {
- if (!pm->vlan_qos)
- continue;
-
+ for (pm = rcu_dereference_rtnl(vlan->egress_priority_map[i]); pm;
+ pm = rcu_dereference_rtnl(pm->next)) {
+ u16 vlan_qos = READ_ONCE(pm->vlan_qos);
m.from = pm->priority;
- m.to = (pm->vlan_qos >> 13) & 0x7;
+ m.to = (vlan_qos >> 13) & 0x7;
if (nla_put(skb, IFLA_VLAN_QOS_MAPPING,
sizeof(m), &m))
goto nla_put_failure;
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index fa67374bda49..0e424e0895b7 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -262,15 +262,19 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
vlan->ingress_priority_map[7]);
seq_printf(seq, " EGRESS priority mappings: ");
+ rcu_read_lock();
for (i = 0; i < 16; i++) {
- const struct vlan_priority_tci_mapping *mp
- = vlan->egress_priority_map[i];
+ const struct vlan_priority_tci_mapping *mp =
+ rcu_dereference(vlan->egress_priority_map[i]);
while (mp) {
+ u16 vlan_qos = READ_ONCE(mp->vlan_qos);
+
seq_printf(seq, "%u:%d ",
- mp->priority, ((mp->vlan_qos >> 13) & 0x7));
- mp = mp->next;
+ mp->priority, ((vlan_qos >> 13) & 0x7));
+ mp = rcu_dereference(mp->next);
}
}
+ rcu_read_unlock();
seq_puts(seq, "\n");
return 0;
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 0c8a06cdd46f..deb1ab1f24b0 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -201,11 +201,12 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
f = br_fdb_find_rcu(br, n->ha, vid);
if (f) {
+ const struct net_bridge_port *dst = READ_ONCE(f->dst);
bool replied = false;
if ((p && (p->flags & BR_PROXYARP)) ||
- (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)) ||
- br_is_neigh_suppress_enabled(f->dst, vid)) {
+ (dst && (dst->flags & BR_PROXYARP_WIFI)) ||
+ br_is_neigh_suppress_enabled(dst, vid)) {
if (!vid)
br_arp_send(br, p, skb->dev, sip, tip,
sha, n->ha, sha, 0, 0);
@@ -469,9 +470,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
f = br_fdb_find_rcu(br, n->ha, vid);
if (f) {
+ const struct net_bridge_port *dst = READ_ONCE(f->dst);
bool replied = false;
- if (br_is_neigh_suppress_enabled(f->dst, vid)) {
+ if (br_is_neigh_suppress_enabled(dst, vid)) {
if (vid != 0)
br_nd_send(br, p, skb, n,
skb->vlan_proto,
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e2c17f620f00..6eb3ab69a514 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -236,6 +236,7 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev,
const unsigned char *addr,
__u16 vid)
{
+ const struct net_bridge_port *dst;
struct net_bridge_fdb_entry *f;
struct net_device *dev = NULL;
struct net_bridge *br;
@@ -248,8 +249,11 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev,
br = netdev_priv(br_dev);
rcu_read_lock();
f = br_fdb_find_rcu(br, addr, vid);
- if (f && f->dst)
- dev = f->dst->dev;
+ if (f) {
+ dst = READ_ONCE(f->dst);
+ if (dst)
+ dev = dst->dev;
+ }
rcu_read_unlock();
return dev;
@@ -346,7 +350,7 @@ static void fdb_delete_local(struct net_bridge *br,
vg = nbp_vlan_group(op);
if (op != p && ether_addr_equal(op->dev->dev_addr, addr) &&
(!vid || br_vlan_find(vg, vid))) {
- f->dst = op;
+ WRITE_ONCE(f->dst, op);
clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
return;
}
@@ -357,7 +361,7 @@ static void fdb_delete_local(struct net_bridge *br,
/* Maybe bridge device has same hw addr? */
if (p && ether_addr_equal(br->dev->dev_addr, addr) &&
(!vid || (v && br_vlan_should_use(v)))) {
- f->dst = NULL;
+ WRITE_ONCE(f->dst, NULL);
clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
return;
}
@@ -928,6 +932,7 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
int br_fdb_fillbuf(struct net_bridge *br, void *buf,
unsigned long maxnum, unsigned long skip)
{
+ const struct net_bridge_port *dst;
struct net_bridge_fdb_entry *f;
struct __fdb_entry *fe = buf;
unsigned long delta;
@@ -944,7 +949,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
continue;
/* ignore pseudo entry for local MAC address */
- if (!f->dst)
+ dst = READ_ONCE(f->dst);
+ if (!dst)
continue;
if (skip) {
@@ -956,8 +962,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN);
/* due to ABI compat need to split into hi/lo */
- fe->port_no = f->dst->port_no;
- fe->port_hi = f->dst->port_no >> 8;
+ fe->port_no = dst->port_no;
+ fe->port_hi = dst->port_no >> 8;
fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags);
if (!test_bit(BR_FDB_STATIC, &f->flags)) {
@@ -1083,9 +1089,11 @@ int br_fdb_dump(struct sk_buff *skb,
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ const struct net_bridge_port *dst = READ_ONCE(f->dst);
+
if (*idx < ctx->fdb_idx)
goto skip;
- if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
+ if (filter_dev && (!dst || dst->dev != filter_dev)) {
if (filter_dev != dev)
goto skip;
/* !f->dst is a special case for bridge
@@ -1093,10 +1101,10 @@ int br_fdb_dump(struct sk_buff *skb,
* Therefore need a little more filtering
* we only want to dump the !f->dst case
*/
- if (f->dst)
+ if (dst)
goto skip;
}
- if (!filter_dev && f->dst)
+ if (!filter_dev && dst)
goto skip;
err = fdb_fill_info(skb, br, f,
diff --git a/net/core/dev.c b/net/core/dev.c
index e59f6025067c..d426c1beeb76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9593,14 +9593,14 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
ops->ndo_change_rx_flags(dev, flags);
}
-static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
unsigned int promiscuity, flags;
kuid_t uid;
kgid_t gid;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
promiscuity = dev->promiscuity + inc;
if (promiscuity == 0) {
@@ -9636,16 +9636,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify) {
- /* The ops lock is only required to ensure consistent locking
- * for `NETDEV_CHANGE` notifiers. This function is sometimes
- * called without the lock, even for devices that are ops
- * locked, such as in `dev_uc_sync_multiple` when using
- * bonding or teaming.
- */
- netdev_ops_assert_locked(dev);
+ if (notify)
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
- }
return 0;
}
@@ -9667,7 +9659,7 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
unsigned int allmulti, flags;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
allmulti = dev->allmulti + inc;
if (allmulti == 0) {
@@ -9697,46 +9689,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/*
- * Upload unicast and multicast address lists to device and
- * configure RX filtering. When the device doesn't support unicast
- * filtering it is put in promiscuous mode while unicast addresses
- * are present.
- */
-void __dev_set_rx_mode(struct net_device *dev)
-{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- /* dev_open will call this function so the list will stay sane. */
- if (!(dev->flags&IFF_UP))
- return;
-
- if (!netif_device_present(dev))
- return;
-
- if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
- /* Unicast addresses changes may only happen under the rtnl,
- * therefore calling __dev_set_promiscuity here is safe.
- */
- if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
- __dev_set_promiscuity(dev, 1, false);
- dev->uc_promisc = true;
- } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
- __dev_set_promiscuity(dev, -1, false);
- dev->uc_promisc = false;
- }
- }
-
- if (ops->ndo_set_rx_mode)
- ops->ndo_set_rx_mode(dev);
-}
-
-void dev_set_rx_mode(struct net_device *dev)
-{
- netif_addr_lock_bh(dev);
- __dev_set_rx_mode(dev);
- netif_addr_unlock_bh(dev);
-}
/**
* netif_get_flags() - get flags reported to userspace
@@ -9775,7 +9727,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
unsigned int old_flags = dev->flags;
int ret;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
/*
* Set the flags on our device.
@@ -11408,6 +11360,11 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
}
+ if (netdev_need_ops_lock(dev) &&
+ dev->netdev_ops->ndo_set_rx_mode &&
+ !dev->netdev_ops->ndo_set_rx_mode_async)
+ netdev_WARN(dev, "ops-locked drivers should use ndo_set_rx_mode_async\n");
+
ret = netdev_do_alloc_pcpu_stats(dev);
if (ret)
goto err_uninit;
@@ -12127,6 +12084,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
mutex_init(&dev->lock);
+ INIT_LIST_HEAD(&dev->rx_mode_node);
+ __hw_addr_init(&dev->rx_mode_addr_cache);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -12231,6 +12190,8 @@ void free_netdev(struct net_device *dev)
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+ __hw_addr_flush(&dev->rx_mode_addr_cache);
+
/* Flush device addresses */
dev_addr_flush(dev);
diff --git a/net/core/dev.h b/net/core/dev.h
index 628bdaebf0ca..0cf24b8f5008 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -78,6 +78,7 @@ void linkwatch_run_queue(void);
void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);
void dev_addr_check(struct net_device *dev);
+void __hw_addr_flush(struct netdev_hw_addr_list *list);
#if IS_ENABLED(CONFIG_NET_SHAPER)
void net_shaper_flush_netdev(struct net_device *dev);
@@ -164,6 +165,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
+int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
+bool netif_rx_mode_clean(struct net_device *dev);
+void netif_rx_mode_sync(struct net_device *dev);
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges, u32 portid,
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 76c91f224886..d73fcb0c6785 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -11,9 +11,18 @@
#include <linux/rtnetlink.h>
#include <linux/export.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <kunit/visibility.h>
#include "dev.h"
+static void netdev_rx_mode_work(struct work_struct *work);
+
+static LIST_HEAD(rx_mode_list);
+static DEFINE_SPINLOCK(rx_mode_lock);
+static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
+
/*
* General list handling functions
*/
@@ -481,7 +490,7 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
}
EXPORT_SYMBOL(__hw_addr_unsync_dev);
-static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
@@ -492,6 +501,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
}
list->count = 0;
}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_flush);
void __hw_addr_init(struct netdev_hw_addr_list *list)
{
@@ -501,6 +511,133 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
}
EXPORT_SYMBOL(__hw_addr_init);
+static void __hw_addr_splice(struct netdev_hw_addr_list *dst,
+ struct netdev_hw_addr_list *src)
+{
+ src->tree = RB_ROOT;
+ list_splice_init(&src->list, &dst->list);
+ dst->count += src->count;
+ src->count = 0;
+}
+
+/**
+ * __hw_addr_list_snapshot - create a snapshot copy of an address list
+ * @snap: destination snapshot list (needs to be __hw_addr_init-initialized)
+ * @list: source address list to snapshot
+ * @addr_len: length of addresses
+ * @cache: entry cache to reuse entries from; allocates with GFP_ATOMIC
+ *	when the cache is empty
+ *
+ * Creates a copy of @list reusing entries from @cache when available.
+ * Must be called under a spinlock.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap,
+ const struct netdev_hw_addr_list *list,
+ int addr_len, struct netdev_hw_addr_list *cache)
+{
+ struct netdev_hw_addr *ha, *entry;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (cache->count) {
+ entry = list_first_entry(&cache->list,
+ struct netdev_hw_addr, list);
+ list_del(&entry->list);
+ cache->count--;
+ memcpy(entry->addr, ha->addr, addr_len);
+ entry->type = ha->type;
+ entry->global_use = false;
+ entry->synced = 0;
+ } else {
+ entry = __hw_addr_create(ha->addr, addr_len, ha->type,
+ false, false);
+ if (!entry) {
+ __hw_addr_flush(snap);
+ return -ENOMEM;
+ }
+ }
+ entry->sync_cnt = ha->sync_cnt;
+ entry->refcount = ha->refcount;
+
+ list_add_tail(&entry->list, &snap->list);
+ __hw_addr_insert(snap, entry, addr_len);
+ snap->count++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_list_snapshot);
+
+/**
+ * __hw_addr_list_reconcile - sync snapshot changes back and free snapshots
+ * @real_list: the real address list to update
+ * @work: the working snapshot (modified by driver via __hw_addr_sync_dev)
+ * @ref: the reference snapshot (untouched copy of original state)
+ * @addr_len: length of addresses
+ * @cache: entry cache to return snapshot entries to for reuse
+ *
+ * Walks the reference snapshot and compares each entry against the work
+ * snapshot to compute sync_cnt deltas. Applies those deltas to @real_list.
+ * Returns snapshot entries to @cache for reuse; frees both snapshots.
+ * Caller must hold netif_addr_lock_bh.
+ */
+void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list,
+ struct netdev_hw_addr_list *work,
+ struct netdev_hw_addr_list *ref, int addr_len,
+ struct netdev_hw_addr_list *cache)
+{
+ struct netdev_hw_addr *ref_ha, *tmp, *work_ha, *real_ha;
+ int delta;
+
+ list_for_each_entry_safe(ref_ha, tmp, &ref->list, list) {
+ work_ha = __hw_addr_lookup(work, ref_ha->addr, addr_len,
+ ref_ha->type);
+ if (work_ha)
+ delta = work_ha->sync_cnt - ref_ha->sync_cnt;
+ else
+ delta = -1;
+
+ if (delta == 0)
+ continue;
+
+ real_ha = __hw_addr_lookup(real_list, ref_ha->addr, addr_len,
+ ref_ha->type);
+ if (!real_ha) {
+ /* The real entry was concurrently removed. If the
+ * driver synced this addr to hardware (delta > 0),
+ * re-insert it as a stale entry so the next work
+ * run unsyncs it from hardware.
+ */
+ if (delta > 0) {
+ rb_erase(&ref_ha->node, &ref->tree);
+ list_del(&ref_ha->list);
+ ref->count--;
+ ref_ha->sync_cnt = delta;
+ ref_ha->refcount = delta;
+ list_add_tail_rcu(&ref_ha->list,
+ &real_list->list);
+ __hw_addr_insert(real_list, ref_ha,
+ addr_len);
+ real_list->count++;
+ }
+ continue;
+ }
+
+ real_ha->sync_cnt += delta;
+ real_ha->refcount += delta;
+ if (!real_ha->refcount) {
+ rb_erase(&real_ha->node, &real_list->tree);
+ list_del_rcu(&real_ha->list);
+ kfree_rcu(real_ha, rcu_head);
+ real_list->count--;
+ }
+ }
+
+ __hw_addr_splice(cache, work);
+ __hw_addr_splice(cache, ref);
+}
+EXPORT_SYMBOL_IF_KUNIT(__hw_addr_list_reconcile);
+
/*
* Device addresses handling functions
*/
@@ -1049,3 +1186,249 @@ void dev_mc_init(struct net_device *dev)
__hw_addr_init(&dev->mc);
}
EXPORT_SYMBOL(dev_mc_init);
+
+static int netif_addr_lists_snapshot(struct net_device *dev,
+ struct netdev_hw_addr_list *uc_snap,
+ struct netdev_hw_addr_list *mc_snap,
+ struct netdev_hw_addr_list *uc_ref,
+ struct netdev_hw_addr_list *mc_ref)
+{
+ int err;
+
+ err = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ if (!err)
+ err = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ if (!err)
+ err = __hw_addr_list_snapshot(mc_snap, &dev->mc,
+ dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ if (!err)
+ err = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+
+ if (err) {
+ __hw_addr_flush(uc_snap);
+ __hw_addr_flush(uc_ref);
+ __hw_addr_flush(mc_snap);
+ }
+
+ return err;
+}
+
+static void netif_addr_lists_reconcile(struct net_device *dev,
+ struct netdev_hw_addr_list *uc_snap,
+ struct netdev_hw_addr_list *mc_snap,
+ struct netdev_hw_addr_list *uc_ref,
+ struct netdev_hw_addr_list *mc_ref)
+{
+ __hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+ __hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len,
+ &dev->rx_mode_addr_cache);
+}
+
+/**
+ * netif_uc_promisc_update() - evaluate whether uc_promisc should be toggled.
+ * @dev: device
+ *
+ * Must be called under netif_addr_lock_bh.
+ * Return: +1 to enter promisc, -1 to leave, 0 for no change.
+ */
+static int netif_uc_promisc_update(struct net_device *dev)
+{
+ if (dev->priv_flags & IFF_UNICAST_FLT)
+ return 0;
+
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ dev->uc_promisc = true;
+ return 1;
+ }
+ if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ dev->uc_promisc = false;
+ return -1;
+ }
+ return 0;
+}
+
+static void netif_rx_mode_run(struct net_device *dev)
+{
+ struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int promisc_inc;
+ int err;
+
+ might_sleep();
+ netdev_ops_assert_locked(dev);
+
+ __hw_addr_init(&uc_snap);
+ __hw_addr_init(&mc_snap);
+ __hw_addr_init(&uc_ref);
+ __hw_addr_init(&mc_ref);
+
+ if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
+ return;
+
+ if (ops->ndo_set_rx_mode_async) {
+ netif_addr_lock_bh(dev);
+ err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap,
+ &uc_ref, &mc_ref);
+ if (err) {
+ netdev_WARN(dev, "failed to sync uc/mc addresses\n");
+ netif_addr_unlock_bh(dev);
+ return;
+ }
+
+ promisc_inc = netif_uc_promisc_update(dev);
+ netif_addr_unlock_bh(dev);
+ } else {
+ netif_addr_lock_bh(dev);
+ promisc_inc = netif_uc_promisc_update(dev);
+ netif_addr_unlock_bh(dev);
+ }
+
+ if (promisc_inc)
+ __dev_set_promiscuity(dev, promisc_inc, false);
+
+ if (ops->ndo_set_rx_mode_async) {
+ ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
+
+ netif_addr_lock_bh(dev);
+ netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap,
+ &uc_ref, &mc_ref);
+ netif_addr_unlock_bh(dev);
+ } else if (ops->ndo_set_rx_mode) {
+ netif_addr_lock_bh(dev);
+ ops->ndo_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ }
+}
+
+static void netdev_rx_mode_work(struct work_struct *work)
+{
+ struct net_device *dev;
+
+ rtnl_lock();
+
+ while (true) {
+ spin_lock_bh(&rx_mode_lock);
+ if (list_empty(&rx_mode_list)) {
+ spin_unlock_bh(&rx_mode_lock);
+ break;
+ }
+ dev = list_first_entry(&rx_mode_list, struct net_device,
+ rx_mode_node);
+ list_del_init(&dev->rx_mode_node);
+ /* We must free netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->rx_mode_tracker);
+ spin_unlock_bh(&rx_mode_lock);
+
+ netdev_lock_ops(dev);
+ netif_rx_mode_run(dev);
+ netdev_unlock_ops(dev);
+ /* Use __dev_put() because netdev_tracker_free() was already
+ * called above. Must be after netdev_unlock_ops() to prevent
+ * netdev_run_todo() from freeing the device while still in use.
+ */
+ __dev_put(dev);
+ }
+
+ rtnl_unlock();
+}
+
+static void netif_rx_mode_queue(struct net_device *dev)
+{
+ spin_lock_bh(&rx_mode_lock);
+ if (list_empty(&dev->rx_mode_node)) {
+ list_add_tail(&dev->rx_mode_node, &rx_mode_list);
+ netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC);
+ }
+ spin_unlock_bh(&rx_mode_lock);
+ schedule_work(&rx_mode_work);
+}
+
+/**
+ * __dev_set_rx_mode() - upload unicast and multicast address lists to device
+ * and configure RX filtering.
+ * @dev: device
+ *
+ * When the device doesn't support unicast filtering it is put in promiscuous
+ * mode while unicast addresses are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int promisc_inc;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags & IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (ops->ndo_set_rx_mode_async || ops->ndo_change_rx_flags ||
+ netdev_need_ops_lock(dev)) {
+ netif_rx_mode_queue(dev);
+ return;
+ }
+
+ /* Legacy path for non-ops-locked HW devices. */
+
+ promisc_inc = netif_uc_promisc_update(dev);
+ if (promisc_inc)
+ __dev_set_promiscuity(dev, promisc_inc, false);
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+bool netif_rx_mode_clean(struct net_device *dev)
+{
+ bool clean = false;
+
+ spin_lock_bh(&rx_mode_lock);
+ if (!list_empty(&dev->rx_mode_node)) {
+ list_del_init(&dev->rx_mode_node);
+ clean = true;
+ /* We must release netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->rx_mode_tracker);
+ }
+ spin_unlock_bh(&rx_mode_lock);
+
+ return clean;
+}
+
+/**
+ * netif_rx_mode_sync() - sync rx mode inline
+ * @dev: network device
+ *
+ * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback
+ * executed from a workqueue. This allows the callback to sleep, but means
+ * the hardware update is deferred and may not be visible to userspace
+ * by the time the initiating syscall returns. netif_rx_mode_sync() steals
+ * the pending workqueue update and executes it inline. This preserves the
+ * atomicity of operations as observed by userspace.
+ */
+void netif_rx_mode_sync(struct net_device *dev)
+{
+ if (netif_rx_mode_clean(dev)) {
+ netif_rx_mode_run(dev);
+ /* Use __dev_put() because netdev_tracker_free() was already
+ * called inside netif_rx_mode_clean().
+ */
+ __dev_put(dev);
+ }
+}
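
The new async rx-mode path hands the driver snapshot copies of dev->uc
and dev->mc, lets it program hardware from sleepable workqueue context,
and reconciles the resulting sync_cnt deltas back into the live lists.
A driver-side sketch, under assumptions: foo_fw_add_filter() and
foo_fw_del_filter() are hypothetical sleepable firmware calls, and the
ndo signature follows the call site in netif_rx_mode_run() above:

static int foo_sync_addr(struct net_device *dev, const unsigned char *addr)
{
	return foo_fw_add_filter(netdev_priv(dev), addr);	/* may sleep */
}

static int foo_unsync_addr(struct net_device *dev, const unsigned char *addr)
{
	return foo_fw_del_filter(netdev_priv(dev), addr);	/* may sleep */
}

static void foo_set_rx_mode_async(struct net_device *dev,
				  struct netdev_hw_addr_list *uc,
				  struct netdev_hw_addr_list *mc)
{
	/* Runs from the rx_mode workqueue with the instance lock held;
	 * operates on the snapshots, never on dev->uc/dev->mc directly.
	 * __hw_addr_sync_dev() invokes the callbacks for new and stale
	 * entries and updates sync_cnt, which the core reconciles back
	 * into the real lists after this returns.
	 */
	__hw_addr_sync_dev(uc, dev, foo_sync_addr, foo_unsync_addr);
	__hw_addr_sync_dev(mc, dev, foo_sync_addr, foo_unsync_addr);
}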
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
index 8e1dba825e94..260e71a2399f 100644
--- a/net/core/dev_addr_lists_test.c
+++ b/net/core/dev_addr_lists_test.c
@@ -2,22 +2,31 @@
#include <kunit/test.h>
#include <linux/etherdevice.h>
+#include <linux/math64.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
static const struct net_device_ops dummy_netdev_ops = {
};
+#define ADDR_A 1
+#define ADDR_B 2
+#define ADDR_C 3
+
struct dev_addr_test_priv {
u32 addr_seen;
+ u32 addr_synced;
+ u32 addr_unsynced;
};
static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
{
struct dev_addr_test_priv *datp = netdev_priv(netdev);
- if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) {
datp->addr_seen |= 1 << a[0];
+ datp->addr_synced |= 1 << a[0];
+ }
return 0;
}
@@ -26,11 +35,22 @@ static int dev_addr_test_unsync(struct net_device *netdev,
{
struct dev_addr_test_priv *datp = netdev_priv(netdev);
- if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) {
datp->addr_seen &= ~(1 << a[0]);
+ datp->addr_unsynced |= 1 << a[0];
+ }
return 0;
}
+static void dev_addr_test_reset(struct net_device *netdev)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ datp->addr_seen = 0;
+ datp->addr_synced = 0;
+ datp->addr_unsynced = 0;
+}
+
static int dev_addr_test_init(struct kunit *test)
{
struct dev_addr_test_priv *datp;
@@ -225,6 +245,363 @@ static void dev_addr_test_add_excl(struct kunit *test)
rtnl_unlock();
}
+/* Snapshot test: basic sync with no concurrent modifications.
+ * Add one address, snapshot, driver syncs it, reconcile propagates
+ * sync_cnt delta back to real list.
+ */
+static void dev_addr_test_snapshot_sync(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Snapshot: ADDR_A has sync_cnt=0, refcount=1 (new) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver syncs ADDR_A to hardware */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* Reconcile: delta=+1 applied to real entry */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ /* Real entry should now reflect the sync: sync_cnt=1, refcount=2 */
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+ ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+ KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+
+ /* Second work run: already synced, nothing to do */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A synced to hardware, then concurrently removed
+ * from the real list before reconcile runs. Reconcile re-inserts ADDR_A as
+ * a stale entry so the next work run unsyncs it from hardware.
+ */
+static void dev_addr_test_snapshot_remove_during_sync(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Snapshot: ADDR_A is new (sync_cnt=0, refcount=1) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver syncs ADDR_A to hardware */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* Concurrent removal: user deletes ADDR_A while driver was working */
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+ KUNIT_EXPECT_EQ(test, 0, netdev->uc.count);
+
+ /* Reconcile: ADDR_A gone from real list but driver synced it,
+ * so it gets re-inserted as stale (sync_cnt=1, refcount=1).
+ */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+ ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+ KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+
+ /* Second work run: stale entry gets unsynced from HW and removed */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_unsynced);
+ KUNIT_EXPECT_EQ(test, 0, netdev->uc.count);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A was stale (unsynced from hardware by driver),
+ * but concurrently re-added by the user. The re-add bumps refcount of
+ * the existing stale entry. Reconcile applies delta=-1, leaving ADDR_A
+ * as a fresh entry (sync_cnt=0, refcount=1) for the next work run.
+ */
+static void dev_addr_test_snapshot_readd_during_unsync(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Sync ADDR_A to hardware: sync_cnt=1, refcount=2 */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* User removes ADDR_A: refcount=1, sync_cnt=1 -> stale */
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+
+ /* Snapshot: ADDR_A is stale (sync_cnt=1, refcount=1) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver unsyncs stale ADDR_A from hardware */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_unsynced);
+
+ /* Concurrent: user re-adds ADDR_A. dev_uc_add finds the existing
+ * stale entry and bumps refcount from 1 -> 2. sync_cnt stays 1.
+ */
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+
+ /* Reconcile: ref sync_cnt=1 matches real sync_cnt=1, delta=-1
+ * applied. Result: sync_cnt=0, refcount=1 (fresh).
+ */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ /* Entry survives as fresh: needs re-sync to HW */
+ KUNIT_EXPECT_EQ(test, 1, netdev->uc.count);
+ ha = list_first_entry(&netdev->uc.list, struct netdev_hw_addr, list);
+ KUNIT_EXPECT_MEMEQ(test, ha->addr, addr, ETH_ALEN);
+ KUNIT_EXPECT_EQ(test, 0, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+
+ /* Second work run: fresh entry gets synced to HW */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_A, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+/* Snapshot test: ADDR_A is new (synced by driver), and independent ADDR_B
+ * is concurrently removed from the real list. A's sync delta propagates
+ * normally; B's absence doesn't interfere.
+ */
+static void dev_addr_test_snapshot_add_and_remove(struct kunit *test)
+{
+ struct netdev_hw_addr_list snap, ref, cache;
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ struct netdev_hw_addr *ha;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+
+ /* Add ADDR_A and ADDR_B (will be synced then removed) */
+ memset(addr, ADDR_A, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+ memset(addr, ADDR_B, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Sync both to hardware: sync_cnt=1, refcount=2 */
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+
+ /* Add ADDR_C (new, will be synced by snapshot) */
+ memset(addr, ADDR_C, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+
+ /* Snapshot: A,B synced (sync_cnt=1,refcount=2); C new (0,1) */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+ __hw_addr_init(&ref);
+ __hw_addr_init(&cache);
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc, ETH_ALEN,
+ &cache));
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&ref, &netdev->uc, ETH_ALEN,
+ &cache));
+ netif_addr_unlock_bh(netdev);
+
+ /* Driver syncs snapshot: ADDR_C is new -> synced; A,B already synced */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&snap, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_C, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_unsynced);
+
+ /* Concurrent: user removes addr B while driver was working */
+ memset(addr, ADDR_B, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+
+ /* Reconcile: ADDR_C's delta=+1 applied to real list.
+ * ADDR_B's delta=0 (unchanged in snapshot),
+ * so nothing to apply to ADDR_B.
+ */
+ netif_addr_lock_bh(netdev);
+ __hw_addr_list_reconcile(&netdev->uc, &snap, &ref, ETH_ALEN,
+ &cache);
+ netif_addr_unlock_bh(netdev);
+
+ /* ADDR_A: unchanged (sync_cnt=1, refcount=2)
+ * ADDR_B: refcount went from 2->1 via dev_uc_del (still present, stale)
+ * ADDR_C: sync propagated (sync_cnt=1, refcount=2)
+ */
+ KUNIT_EXPECT_EQ(test, 3, netdev->uc.count);
+ netdev_hw_addr_list_for_each(ha, &netdev->uc) {
+ u8 id = ha->addr[0];
+
+ if (!memchr_inv(ha->addr, id, ETH_ALEN)) {
+ if (id == ADDR_A) {
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+ } else if (id == ADDR_B) {
+ /* B: still present but now stale */
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 1, ha->refcount);
+ } else if (id == ADDR_C) {
+ KUNIT_EXPECT_EQ(test, 1, ha->sync_cnt);
+ KUNIT_EXPECT_EQ(test, 2, ha->refcount);
+ }
+ }
+ }
+
+ /* Second work run: ADDR_B is stale, gets unsynced and removed */
+ dev_addr_test_reset(netdev);
+ __hw_addr_sync_dev(&netdev->uc, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_synced);
+ KUNIT_EXPECT_EQ(test, 1 << ADDR_B, datp->addr_unsynced);
+ KUNIT_EXPECT_EQ(test, 2, netdev->uc.count);
+
+ __hw_addr_flush(&cache);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_snapshot_benchmark(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct netdev_hw_addr_list snap, cache;
+ u8 addr[ETH_ALEN];
+ s64 duration = 0;
+ ktime_t start;
+ int i, iter;
+
+ rtnl_lock();
+
+ for (i = 0; i < 1024; i++) {
+ memset(addr, 0, sizeof(addr));
+ addr[0] = (i >> 8) & 0xff;
+ addr[1] = i & 0xff;
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add(netdev, addr));
+ }
+
+ __hw_addr_init(&cache);
+
+ for (iter = 0; iter < 1000; iter++) {
+ netif_addr_lock_bh(netdev);
+ __hw_addr_init(&snap);
+
+ start = ktime_get();
+ KUNIT_EXPECT_EQ(test, 0,
+ __hw_addr_list_snapshot(&snap, &netdev->uc,
+ ETH_ALEN, &cache));
+ duration += ktime_to_ns(ktime_sub(ktime_get(), start));
+
+ netif_addr_unlock_bh(netdev);
+ __hw_addr_flush(&snap);
+ }
+
+ __hw_addr_flush(&cache);
+
+ kunit_info(test,
+ "1024 addrs x 1000 snapshots: %lld ns total, %lld ns/iter",
+ duration, div_s64(duration, 1000));
+
+ rtnl_unlock();
+}
+
static struct kunit_case dev_addr_test_cases[] = {
KUNIT_CASE(dev_addr_test_basic),
KUNIT_CASE(dev_addr_test_sync_one),
@@ -232,6 +609,11 @@ static struct kunit_case dev_addr_test_cases[] = {
KUNIT_CASE(dev_addr_test_del_main),
KUNIT_CASE(dev_addr_test_add_set),
KUNIT_CASE(dev_addr_test_add_excl),
+ KUNIT_CASE(dev_addr_test_snapshot_sync),
+ KUNIT_CASE(dev_addr_test_snapshot_remove_during_sync),
+ KUNIT_CASE(dev_addr_test_snapshot_readd_during_unsync),
+ KUNIT_CASE(dev_addr_test_snapshot_add_and_remove),
+ KUNIT_CASE_SLOW(dev_addr_test_snapshot_benchmark),
{}
};
@@ -243,5 +625,6 @@ static struct kunit_suite dev_addr_test_suite = {
};
kunit_test_suite(dev_addr_test_suite);
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list");
MODULE_LICENSE("GPL");
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index f28852078aa6..437947dd08ed 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
netdev_lock_ops(dev);
ret = netif_change_flags(dev, flags, extack);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_promiscuity(dev, inc);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_allmulti(dev, inc, true);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 7a8966544c9d..f3979b276090 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return err;
case SIOCADDMULTI:
- if (!ops->ndo_set_rx_mode ||
+ if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
case SIOCDELMULTI:
- if (!ops->ndo_set_rx_mode ||
+ if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5fa9189eb772..80a3b702a2d4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5396,7 +5396,7 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
if (val <= 0)
return -EINVAL;
tp->snd_cwnd_clamp = val;
- tp->snd_ssthresh = val;
+ WRITE_ONCE(tp->snd_ssthresh, val);
break;
case TCP_BPF_DELACK_MAX:
timeout = usecs_to_jiffies(val);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1b61bb25ba0e..2a98f5fa74eb 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1374,16 +1374,13 @@ proto_again:
break;
}
- /* least significant bit of the most significant octet
- * indicates if protocol field was compressed
+ /* PFC (compressed 1-byte protocol) frames are not processed.
+ * A compressed protocol field has the least significant bit of
+ * the most significant octet set, which will fail the following
+ * ppp_proto_is_valid(), returning FLOW_DISSECT_RET_OUT_BAD.
*/
ppp_proto = ntohs(hdr->proto);
- if (ppp_proto & 0x0100) {
- ppp_proto = ppp_proto >> 8;
- nhoff += PPPOE_SES_HLEN - 1;
- } else {
- nhoff += PPPOE_SES_HLEN;
- }
+ nhoff += PPPOE_SES_HLEN;
if (ppp_proto == PPP_IP) {
proto = htons(ETH_P_IP);
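
The flow_dissector change stops decompressing PFC (RFC 1661 protocol
field compression) values and instead lets them fail the subsequent
validity check. For reference, a sketch of that check's logic: valid
PPP protocol numbers have an even most significant octet and an odd
least significant octet, so a compressed one-byte protocol read as 16
bits fails (illustrative helper, not the hunk's code):

static bool ppp_proto_is_valid_sketch(u16 proto)
{
	/* e.g. PPP_IP is 0x0021; its PFC form on the wire is the single
	 * byte 0x21, which lands in the high octet when read as a 16-bit
	 * field and sets bit 8, failing this test.
	 */
	return (proto & 0x0101) == 0x0001;
}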
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 69daba3ddaf0..b613bb6e07df 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3431,6 +3431,7 @@ errout:
dev->name);
}
+ netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c
index a1b044467bd6..8398d72d7e4d 100644
--- a/net/dsa/conduit.c
+++ b/net/dsa/conduit.c
@@ -27,9 +27,7 @@ static int dsa_conduit_get_regs_len(struct net_device *dev)
int len;
if (ops && ops->get_regs_len) {
- netdev_lock_ops(dev);
len = ops->get_regs_len(dev);
- netdev_unlock_ops(dev);
if (len < 0)
return len;
ret += len;
@@ -60,15 +58,11 @@ static void dsa_conduit_get_regs(struct net_device *dev,
int len;
if (ops && ops->get_regs_len && ops->get_regs) {
- netdev_lock_ops(dev);
len = ops->get_regs_len(dev);
- if (len < 0) {
- netdev_unlock_ops(dev);
+ if (len < 0)
return;
- }
regs->len = len;
ops->get_regs(dev, regs, data);
- netdev_unlock_ops(dev);
data += regs->len;
}
@@ -115,10 +109,8 @@ static void dsa_conduit_get_ethtool_stats(struct net_device *dev,
int count, mcount = 0;
if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
- netdev_lock_ops(dev);
mcount = ops->get_sset_count(dev, ETH_SS_STATS);
ops->get_ethtool_stats(dev, stats, data);
- netdev_unlock_ops(dev);
}
list_for_each_entry(dp, &dst->ports, list) {
@@ -149,10 +141,8 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev,
if (count >= 0)
phy_ethtool_get_stats(dev->phydev, stats, data);
} else if (ops && ops->get_sset_count && ops->get_ethtool_phy_stats) {
- netdev_lock_ops(dev);
count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
ops->get_ethtool_phy_stats(dev, stats, data);
- netdev_unlock_ops(dev);
}
if (count < 0)
@@ -176,13 +166,11 @@ static int dsa_conduit_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch_tree *dst = cpu_dp->dst;
int count = 0;
- netdev_lock_ops(dev);
if (sset == ETH_SS_PHY_STATS && dev->phydev &&
(!ops || !ops->get_ethtool_phy_stats))
count = phy_ethtool_get_sset_count(dev->phydev);
else if (ops && ops->get_sset_count)
count = ops->get_sset_count(dev, sset);
- netdev_unlock_ops(dev);
if (count < 0)
count = 0;
@@ -239,7 +227,6 @@ static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset,
struct dsa_switch_tree *dst = cpu_dp->dst;
int count, mcount = 0;
- netdev_lock_ops(dev);
if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
!ops->get_ethtool_phy_stats) {
mcount = phy_ethtool_get_sset_count(dev->phydev);
@@ -253,7 +240,6 @@ static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset,
mcount = 0;
ops->get_strings(dev, stringset, data);
}
- netdev_unlock_ops(dev);
list_for_each_entry(dp, &dst->ports, list) {
if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2f4fac22d1ab..7eeff658b467 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -64,6 +64,7 @@
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
+#include <linux/nospec.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
@@ -371,7 +372,9 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
- if (icmp_pointers[icmp_param->data.icmph.type].error)
+ if (icmp_param->data.icmph.type <= NR_ICMP_TYPES &&
+ icmp_pointers[array_index_nospec(icmp_param->data.icmph.type,
+ NR_ICMP_TYPES + 1)].error)
nf_ct_attach(skb, icmp_param->skb);
return 0;
}
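
The icmp.c fix is the standard two-step Spectre-v1 hardening: bounds-check
the attacker-influenced index, then clamp it with array_index_nospec() so
a mispredicted branch cannot read out of bounds speculatively. A minimal
sketch of the pattern in isolation (hypothetical helper name):

static bool icmp_type_is_error(unsigned int type)
{
	if (type > NR_ICMP_TYPES)
		return false;
	/* Sanitize the index after the check: under speculation the CPU
	 * may execute past the branch, but the clamped index can never
	 * exceed NR_ICMP_TYPES.
	 */
	type = array_index_nospec(type, NR_ICMP_TYPES + 1);
	return icmp_pointers[type].error;
}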
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4ac3ae1bc1af..928654c34156 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1479,16 +1479,19 @@ void inet_csk_listen_stop(struct sock *sk)
if (nreq) {
refcount_set(&nreq->rsk_refcnt, 1);
+ rcu_read_lock();
if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
__NET_INC_STATS(sock_net(nsk),
LINUX_MIB_TCPMIGRATEREQSUCCESS);
reqsk_migrate_reset(req);
+ READ_ONCE(nsk->sk_data_ready)(nsk);
} else {
__NET_INC_STATS(sock_net(nsk),
LINUX_MIB_TCPMIGRATEREQFAILURE);
reqsk_migrate_reset(nreq);
__reqsk_free(nreq);
}
+ rcu_read_unlock();
/* inet_csk_reqsk_queue_add() has already
* called inet_child_forget() on failure case.
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a5db7c67d61b..625a1ca13b1b 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -79,7 +79,7 @@ static int ipt_nat_register_lookups(struct net *net)
while (i)
nf_nat_ipv4_unregister_fn(net, &ops[--i]);
- kfree(ops);
+ kfree_rcu(ops, rcu);
return ret;
}
}
@@ -100,7 +100,7 @@ static void ipt_nat_unregister_lookups(struct net *net)
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
nf_nat_ipv4_unregister_fn(net, &ops[i]);
- kfree(ops);
+ kfree_rcu(ops, rcu);
}
static int iptable_nat_table_init(struct net *net)
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 904a060a7330..f92fcc39fc4c 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2469,10 +2469,10 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old,
goto err_notify;
}
- /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+ /* When replacing a nexthop with one of a different family, potentially
* update IPv4 indication in all the groups using the nexthop.
*/
- if (oldi->family == AF_INET && newi->family == AF_INET6) {
+ if (oldi->family != newi->family) {
list_for_each_entry(nhge, &old->grp_list, nh_list) {
struct nexthop *nhp = nhge->nh_parent;
struct nh_group *nhg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2014a6408e93..432fa28e47d4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3424,7 +3424,7 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ WRITE_ONCE(tp->snd_ssthresh, TCP_INFINITE_SSTHRESH);
tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
tp->snd_cwnd_cnt = 0;
tp->is_cwnd_limited = 0;
@@ -3622,7 +3622,8 @@ static void tcp_enable_tx_delay(struct sock *sk, int val)
if (delta && sk->sk_state == TCP_ESTABLISHED) {
s64 srtt = (s64)tp->srtt_us + delta;
- tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+ WRITE_ONCE(tp->srtt_us,
+ clamp_t(s64, srtt, 1, ~0U));
/* Note: does not deal with non zero icsk_backoff */
tcp_set_rto(sk);
@@ -4190,12 +4191,18 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
struct tcp_info *info)
{
u64 stats[__TCP_CHRONO_MAX], total = 0;
- enum tcp_chrono i;
+ enum tcp_chrono i, cur;
+ /* Following READ_ONCE()s pair with WRITE_ONCE()s in tcp_chrono_set().
+ * This is because socket lock might not be owned by us at this point.
+ * This is best effort, tcp_get_timestamping_opt_stats() can
+ * see wrong values. A real fix would be too costly for TCP fast path.
+ */
+ cur = READ_ONCE(tp->chrono_type);
for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
- stats[i] = tp->chrono_stat[i - 1];
- if (i == tp->chrono_type)
- stats[i] += tcp_jiffies32 - tp->chrono_start;
+ stats[i] = READ_ONCE(tp->chrono_stat[i - 1]);
+ if (i == cur)
+ stats[i] += tcp_jiffies32 - READ_ONCE(tp->chrono_start);
stats[i] *= USEC_PER_SEC / HZ;
total += stats[i];
}
@@ -4427,9 +4434,9 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
info.tcpi_sndbuf_limited, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
- tp->data_segs_out, TCP_NLA_PAD);
+ READ_ONCE(tp->data_segs_out), TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
- tp->total_retrans, TCP_NLA_PAD);
+ READ_ONCE(tp->total_retrans), TCP_NLA_PAD);
rate = READ_ONCE(sk->sk_pacing_rate);
rate64 = (rate != ~0UL) ? rate : ~0ULL;
@@ -4438,37 +4445,42 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
rate64 = tcp_compute_delivery_rate(tp);
nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
- nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
- nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
- nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
+ nla_put_u32(stats, TCP_NLA_SND_CWND, READ_ONCE(tp->snd_cwnd));
+ nla_put_u32(stats, TCP_NLA_REORDERING, READ_ONCE(tp->reordering));
+ nla_put_u32(stats, TCP_NLA_MIN_RTT, data_race(tcp_min_rtt(tp)));
nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
READ_ONCE(inet_csk(sk)->icsk_retransmits));
- nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
- nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
- nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
- nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
-
- nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
+ nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, data_race(!!tp->rate_app_limited));
+ nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, READ_ONCE(tp->snd_ssthresh));
+ nla_put_u32(stats, TCP_NLA_DELIVERED, READ_ONCE(tp->delivered));
+ nla_put_u32(stats, TCP_NLA_DELIVERED_CE, READ_ONCE(tp->delivered_ce));
+
+ nla_put_u32(stats, TCP_NLA_SNDQ_SIZE,
+ max_t(int, 0,
+ READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_una)));
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
- nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
- TCP_NLA_PAD);
- nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, READ_ONCE(tp->bytes_sent),
TCP_NLA_PAD);
- nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
- nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
- nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
- nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS,
+ READ_ONCE(tp->bytes_retrans), TCP_NLA_PAD);
+ nla_put_u32(stats, TCP_NLA_DSACK_DUPS, READ_ONCE(tp->dsack_dups));
+ nla_put_u32(stats, TCP_NLA_REORD_SEEN, READ_ONCE(tp->reord_seen));
+ nla_put_u32(stats, TCP_NLA_SRTT, READ_ONCE(tp->srtt_us) >> 3);
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH,
+ READ_ONCE(tp->timeout_rehash));
nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
- max_t(int, 0, tp->write_seq - tp->snd_nxt));
+ max_t(int, 0,
+ READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt)));
nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
TCP_NLA_PAD);
if (ack_skb)
nla_put_u8(stats, TCP_NLA_TTL,
tcp_skb_ttl_or_hop_limit(ack_skb));
- nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
+ nla_put_u32(stats, TCP_NLA_REHASH,
+ READ_ONCE(tp->plb_rehash) + READ_ONCE(tp->timeout_rehash));
return stats;
}
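
The hunks above, and most of the tcp_* changes that follow, apply one recurring pattern: every tcp_sock field that tcp_get_timestamping_opt_stats() may read without the socket lock gets a WRITE_ONCE() at each locked writer and a READ_ONCE() at the lockless reader. A minimal sketch of the pairing, with illustrative (non-kernel) names:

/* Sketch of the annotation pattern, assuming the writer holds the
 * socket lock while the reader may not. The markings prevent
 * load/store tearing and document the intentional race for KCSAN.
 */
struct example_stats {
	u64 bytes_sent;
};

/* Writer side: runs under the socket lock. */
static void example_account(struct example_stats *st, u64 len)
{
	WRITE_ONCE(st->bytes_sent, st->bytes_sent + len);
}

/* Reader side: may run locklessly; a momentarily stale value is
 * acceptable for statistics reporting.
 */
static u64 example_read(const struct example_stats *st)
{
	return READ_ONCE(st->bytes_sent);
}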
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 1ddc20a399b0..aec7805b1d37 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -897,8 +897,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
- tcp_sk(sk)->snd_ssthresh =
- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+ WRITE_ONCE(tcp_sk(sk)->snd_ssthresh,
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT));
} /* fall through to check if in-flight is already small: */
if (bbr->mode == BBR_DRAIN &&
bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
@@ -1043,7 +1043,7 @@ __bpf_kfunc static void bbr_init(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->prior_cwnd = 0;
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ WRITE_ONCE(tp->snd_ssthresh, TCP_INFINITE_SSTHRESH);
bbr->rtt_cnt = 0;
bbr->next_rtt_delivered = tp->delivered;
bbr->prev_ca_state = TCP_CA_Open;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 58358bf92e1b..65444ff14241 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -74,7 +74,7 @@ static void bictcp_init(struct sock *sk)
bictcp_reset(ca);
if (initial_ssthresh)
- tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+ WRITE_ONCE(tcp_sk(sk)->snd_ssthresh, initial_ssthresh);
}
/*
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index ceabfd690a29..0812c390aee5 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -162,7 +162,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
tcp_snd_cwnd(tp));
- tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
return;
}
}
@@ -181,7 +181,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
tcp_snd_cwnd(tp));
- tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
}
}
}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ab78b5ae8d0e..119bf8cbb007 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -136,7 +136,7 @@ __bpf_kfunc static void cubictcp_init(struct sock *sk)
bictcp_hystart_reset(sk);
if (!hystart && initial_ssthresh)
- tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+ WRITE_ONCE(tcp_sk(sk)->snd_ssthresh, initial_ssthresh);
}
__bpf_kfunc static void cubictcp_cwnd_event_tx_start(struct sock *sk)
@@ -420,7 +420,7 @@ static void hystart_update(struct sock *sk, u32 delay)
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
tcp_snd_cwnd(tp));
- tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
}
}
}
@@ -440,7 +440,7 @@ static void hystart_update(struct sock *sk, u32 delay)
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
tcp_snd_cwnd(tp));
- tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_snd_cwnd(tp));
}
}
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 96c99999e09d..274e628e7cf8 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -177,7 +177,7 @@ static void dctcp_react_to_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
ca->loss_cwnd = tcp_snd_cwnd(tp);
- tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
+ WRITE_ONCE(tp->snd_ssthresh, max(tcp_snd_cwnd(tp) >> 1U, 2U));
}
__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 021f745747c5..d5c9e65d9760 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -476,14 +476,14 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp,
static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
{
- tp->delivered_ce += ecn_count;
+ WRITE_ONCE(tp->delivered_ce, tp->delivered_ce + ecn_count);
}
/* Updates the delivered and delivered_ce counts */
static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
- tp->delivered += delivered;
+ WRITE_ONCE(tp->delivered, tp->delivered + delivered);
if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
tcp_count_delivered_ce(tp, delivered);
}
@@ -1132,7 +1132,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tcp_bpf_rtt(sk, mrtt_us, srtt);
}
- tp->srtt_us = max(1U, srtt);
+ WRITE_ONCE(tp->srtt_us, max(1U, srtt));
}
void tcp_update_pacing_rate(struct sock *sk)
@@ -1246,7 +1246,7 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
state->flag |= FLAG_DSACK_TLP;
- tp->dsack_dups += dup_segs;
+ WRITE_ONCE(tp->dsack_dups, tp->dsack_dups + dup_segs);
/* Skip the DSACK if dup segs weren't retransmitted by sender */
if (tp->dsack_dups > tp->total_retrans)
return 0;
@@ -1293,12 +1293,13 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
- tp->reordering = min_t(u32, (metric + mss - 1) / mss,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
+ WRITE_ONCE(tp->reordering,
+ min_t(u32, (metric + mss - 1) / mss,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)));
}
/* This exciting event is worth to be remembered. 8) */
- tp->reord_seen++;
+ WRITE_ONCE(tp->reord_seen, tp->reord_seen + 1);
NET_INC_STATS(sock_net(sk),
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
@@ -2439,9 +2440,10 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
if (!tcp_limit_reno_sacked(tp))
return;
- tp->reordering = min_t(u32, tp->packets_out + addend,
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
- tp->reord_seen++;
+ WRITE_ONCE(tp->reordering,
+ min_t(u32, tp->packets_out + addend,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)));
+ WRITE_ONCE(tp->reord_seen, tp->reord_seen + 1);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
@@ -2565,7 +2567,7 @@ void tcp_enter_loss(struct sock *sk)
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->prior_cwnd = tcp_snd_cwnd(tp);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ WRITE_ONCE(tp->snd_ssthresh, icsk->icsk_ca_ops->ssthresh(sk));
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
@@ -2579,8 +2581,8 @@ void tcp_enter_loss(struct sock *sk)
reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
tp->sacked_out >= reordering)
- tp->reordering = min_t(unsigned int, tp->reordering,
- reordering);
+ WRITE_ONCE(tp->reordering,
+ min_t(unsigned int, tp->reordering, reordering));
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
@@ -2858,7 +2860,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
if (tp->prior_ssthresh > tp->snd_ssthresh) {
- tp->snd_ssthresh = tp->prior_ssthresh;
+ WRITE_ONCE(tp->snd_ssthresh, tp->prior_ssthresh);
tcp_ecn_withdraw_cwr(tp);
}
}
@@ -2976,7 +2978,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tp->prior_cwnd = tcp_snd_cwnd(tp);
tp->prr_delivered = 0;
tp->prr_out = 0;
- tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ WRITE_ONCE(tp->snd_ssthresh, inet_csk(sk)->icsk_ca_ops->ssthresh(sk));
tcp_ecn_queue_cwr(tp);
}
@@ -3118,7 +3120,7 @@ static void tcp_non_congestion_loss_retransmit(struct sock *sk)
if (icsk->icsk_ca_state != TCP_CA_Loss) {
tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
tp->prior_ssthresh = 0;
tp->undo_marker = 0;
tcp_set_ca_state(sk, TCP_CA_Loss);
@@ -3910,7 +3912,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
sock_owned_by_me((struct sock *)tp);
tp->bytes_acked += delta;
tcp_snd_sne_update(tp, ack);
- tp->snd_una = ack;
+ WRITE_ONCE(tp->snd_una, ack);
}
static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
@@ -4284,11 +4286,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto old_ack;
}
- /* If the ack includes data we haven't sent yet, discard
- * this segment (RFC793 Section 3.9).
+ /* If the ack includes data we haven't sent yet, drop the
+ * segment. RFC 793 Section 3.9 and RFC 5961 Section 5.2
+ * require us to send an ACK back in that case.
*/
- if (after(ack, tp->snd_nxt))
+ if (after(ack, tp->snd_nxt)) {
+ if (!(flag & FLAG_NO_CHALLENGE_ACK))
+ tcp_send_challenge_ack(sk, false);
return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
+ }
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
@@ -6777,7 +6783,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
/* SYN-data is counted as two separate packets in tcp_ack() */
if (tp->delivered > 1)
- --tp->delivered;
+ WRITE_ONCE(tp->delivered, tp->delivered - 1);
}
tcp_fastopen_add_skb(sk, synack);
@@ -7210,7 +7216,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
SKB_DR_SET(reason, NOT_SPECIFIED);
switch (sk->sk_state) {
case TCP_SYN_RECV:
- tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
+ WRITE_ONCE(tp->delivered, tp->delivered + 1); /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
@@ -7238,7 +7244,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_socket)
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ WRITE_ONCE(tp->snd_una, TCP_SKB_CB(skb)->ack_seq);
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
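
The challenge-ACK hunk above hinges on after(), which compares 32-bit TCP sequence numbers modulo 2^32 so the check stays correct across wraparound. A minimal equivalent of that comparison (the kernel's real definition lives in include/net/tcp.h):

/* Sketch: wraparound-safe "a is later than b" for 32-bit sequence
 * numbers. Casting the difference to signed makes 0x00000001 compare
 * as after 0xffffffff, which a plain unsigned compare gets wrong.
 */
static inline bool seq_after(u32 a, u32 b)
{
	return (s32)(a - b) > 0;
}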
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 06b1d5d3b6df..dc0c081fc1f3 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -490,13 +490,13 @@ void tcp_init_metrics(struct sock *sk)
val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val) {
- tp->snd_ssthresh = val;
+ WRITE_ONCE(tp->snd_ssthresh, val);
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ WRITE_ONCE(tp->snd_ssthresh, tp->snd_cwnd_clamp);
}
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val)
- tp->reordering = val;
+ WRITE_ONCE(tp->reordering, val);
crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
rcu_read_unlock();
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index a60662f4bdf9..f345897a68df 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -396,8 +396,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
/* We have enough data to determine we are congested */
ca->nv_allow_cwnd_growth = 0;
- tp->snd_ssthresh =
- (nv_ssthresh_factor * max_win) >> 3;
+ WRITE_ONCE(tp->snd_ssthresh,
+ (nv_ssthresh_factor * max_win) >> 3);
if (tcp_snd_cwnd(tp) - max_win > 2) {
/* gap > 2, we do exponential cwnd decrease */
int dec;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8e99687526a6..f9d8755705f7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -171,7 +171,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
restart_cwnd = min(restart_cwnd, cwnd);
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
@@ -1688,8 +1688,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
- tp->data_segs_out += tcp_skb_pcount(skb);
- tp->bytes_sent += skb->len - tcp_header_size;
+ WRITE_ONCE(tp->data_segs_out,
+ tp->data_segs_out + tcp_skb_pcount(skb));
+ WRITE_ONCE(tp->bytes_sent,
+ tp->bytes_sent + skb->len - tcp_header_size);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -2142,7 +2144,7 @@ static void tcp_cwnd_application_limited(struct sock *sk)
u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 win_used = max(tp->snd_cwnd_used, init_win);
if (win_used < tcp_snd_cwnd(tp)) {
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
}
tp->snd_cwnd_used = 0;
@@ -3642,8 +3644,8 @@ start:
TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans += segs;
- tp->bytes_retrans += skb->len;
+ WRITE_ONCE(tp->total_retrans, tp->total_retrans + segs);
+ WRITE_ONCE(tp->bytes_retrans, tp->bytes_retrans + skb->len);
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
@@ -4152,7 +4154,7 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_wnd = 0;
tcp_init_wl(tp, 0);
tcp_write_queue_purge(sk);
- tp->snd_una = tp->write_seq;
+ WRITE_ONCE(tp->snd_una, tp->write_seq);
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
WRITE_ONCE(tp->snd_nxt, tp->write_seq);
@@ -4646,7 +4648,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
* However in this case, we are dealing with a passive fastopen
* socket thus we can change total_retrans value.
*/
- tcp_sk_rw(sk)->total_retrans++;
+ WRITE_ONCE(tcp_sk_rw(sk)->total_retrans,
+ tcp_sk_rw(sk)->total_retrans + 1);
}
trace_tcp_retransmit_synack(sk, req);
WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
index 68ccdb9a5412..c11a0cd3f8fe 100644
--- a/net/ipv4/tcp_plb.c
+++ b/net/ipv4/tcp_plb.c
@@ -80,7 +80,7 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
sk_rethink_txhash(sk);
plb->consec_cong_rounds = 0;
- tcp_sk(sk)->plb_rehash++;
+ WRITE_ONCE(tcp_sk(sk)->plb_rehash, tcp_sk(sk)->plb_rehash + 1);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
}
EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ea99988795e7..8d791a954cd6 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -297,7 +297,7 @@ static int tcp_write_timeout(struct sock *sk)
}
if (sk_rethink_txhash(sk)) {
- tp->timeout_rehash++;
+ WRITE_ONCE(tp->timeout_rehash, tp->timeout_rehash + 1);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 950a66966059..574453af6bc0 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -245,7 +245,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp),
(u32)target_cwnd + 1));
- tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+ WRITE_ONCE(tp->snd_ssthresh,
+ tcp_vegas_ssthresh(tp));
} else if (tcp_in_slow_start(tp)) {
/* Slow start. */
@@ -261,8 +262,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* we slow down.
*/
tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
- tp->snd_ssthresh
- = tcp_vegas_ssthresh(tp);
+ WRITE_ONCE(tp->snd_ssthresh,
+ tcp_vegas_ssthresh(tp));
} else if (diff < alpha) {
/* We don't have enough extra packets
* in the network, so speed up.
@@ -280,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_current_ssthresh(sk));
}
/* Wipe the slate clean for the next RTT. */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c6e97141eef2..b5a42adfd6ca 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -244,11 +244,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
switch (event) {
case CA_EVENT_COMPLETE_CWR:
- tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_westwood_bw_rttmin(sk));
tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
break;
case CA_EVENT_LOSS:
- tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ WRITE_ONCE(tp->snd_ssthresh, tcp_westwood_bw_rttmin(sk));
/* Update RTT_min when next ack arrives */
w->reset_rtt_min = 1;
break;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index b22b3dccd05e..9e581154f18f 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -147,7 +147,8 @@ do_vegas:
tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp),
yeah->reno_count));
- tp->snd_ssthresh = tcp_snd_cwnd(tp);
+ WRITE_ONCE(tp->snd_ssthresh,
+ tcp_snd_cwnd(tp));
}
if (yeah->reno_count <= 2)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 799d9e9ac45d..efb23807a026 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -1104,7 +1104,6 @@ static int icmpv6_rcv(struct sk_buff *skb)
struct net *net = dev_net_rcu(skb->dev);
struct net_device *dev = icmp6_dev(skb);
struct inet6_dev *idev = __in6_dev_get(dev);
- const struct in6_addr *saddr, *daddr;
struct icmp6hdr *hdr;
u8 type;
@@ -1135,12 +1134,10 @@ static int icmpv6_rcv(struct sk_buff *skb)
__ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS);
- saddr = &ipv6_hdr(skb)->saddr;
- daddr = &ipv6_hdr(skb)->daddr;
-
if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
- saddr, daddr);
+ &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr);
goto csum_error;
}
@@ -1220,7 +1217,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
break;
net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n",
- saddr, daddr);
+ &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr);
/*
* error of unknown type.
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 46bc06506470..c468c83af0f2 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -62,6 +62,8 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("ip6tnl");
MODULE_ALIAS_NETDEV("ip6tnl0");
+#define IP6_TUNNEL_MAX_DEST_TLVS 8
+
#define IP6_TUNNEL_HASH_SIZE_SHIFT 5
#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT)
@@ -425,11 +427,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
break;
}
if (nexthdr == NEXTHDR_DEST) {
+ int tlv_cnt = 0;
u16 i = 2;
while (1) {
struct ipv6_tlv_tnl_enc_lim *tel;
+ if (unlikely(tlv_cnt++ >= IP6_TUNNEL_MAX_DEST_TLVS))
+ break;
+
/* No more room for encapsulation limit */
if (i + sizeof(*tel) > optlen)
break;
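
The ip6_tnl_parse_tlv_enc_lim() fix bounds a parser whose iteration count was previously controlled by packet contents. A generic sketch of the same bounded TLV walk, with hypothetical names (MAX_OPTS and the type/length byte layout are assumptions, not the tunnel code):

/* Sketch: stop the scan after a fixed option budget as well as at the
 * end of the buffer, so a crafted packet cannot keep the parser
 * spinning over pathological option chains. "len" here is assumed to
 * cover the whole option including its two header bytes.
 */
#define MAX_OPTS 8

static const u8 *find_opt(const u8 *p, const u8 *end, u8 want)
{
	int seen = 0;

	while (p + 2 <= end && seen++ < MAX_OPTS) {
		u8 type = p[0], len = p[1];

		if (len < 2 || p + len > end)
			break;			/* malformed option */
		if (type == want)
			return p;
		p += len;
	}
	return NULL;
}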
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index e119d4f090cc..5be723232df8 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -81,7 +81,7 @@ static int ip6t_nat_register_lookups(struct net *net)
while (i)
nf_nat_ipv6_unregister_fn(net, &ops[--i]);
- kfree(ops);
+ kfree_rcu(ops, rcu);
return ret;
}
}
@@ -102,7 +102,7 @@ static void ip6t_nat_unregister_lookups(struct net *net)
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
nf_nat_ipv6_unregister_fn(net, &ops[i]);
- kfree(ops);
+ kfree_rcu(ops, rcu);
}
static int ip6table_nat_table_init(struct net *net)
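
The kfree()-to-kfree_rcu() conversions assume the ops array gains an embedded struct rcu_head named rcu (the member the call sites reference). For readers unfamiliar with the idiom, a minimal sketch:

/* Sketch: an object still reachable by RCU readers must outlive the
 * grace period. kfree_rcu() defers the free via the embedded rcu_head
 * instead of freeing immediately like kfree().
 */
struct lookup_ops {
	void (*fn)(void);
	struct rcu_head rcu;		/* needed by kfree_rcu(ops, rcu) */
};

static void drop_ops(struct lookup_ops __rcu **slot)
{
	struct lookup_ops *ops = rcu_dereference_protected(*slot, 1);

	RCU_INIT_POINTER(*slot, NULL);	/* unpublish first */
	kfree_rcu(ops, rcu);		/* freed after readers finish */
}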
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 97b50d9b1365..9b64343ebad6 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -746,7 +746,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla,
newts->type = LWTUNNEL_ENCAP_SEG6;
newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
- if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP)
+ if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP &&
+ tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP_RED)
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
newts->headroom = seg6_lwt_headroom(tuninfo);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 59d593bb5d18..1b210db3119e 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -520,8 +520,10 @@ static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
if (sk->sk_state == TCP_SYN_SENT) {
const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
- if (!timeo || !llc_ui_wait_for_conn(sk, timeo))
+ if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) {
+ rc = -EINPROGRESS;
goto out;
+ }
rc = sock_intr_errno(timeo);
if (signal_pending(current))
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 26fb8c6bbad2..1f3dccbb7aed 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -441,6 +441,7 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
unsigned long f;
u8 tag, flags;
int rc;
+ u8 ver;
msk = NULL;
rc = -EINVAL;
@@ -467,7 +468,8 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
netid = mctp_cb(skb)->net;
skb_pull(skb, sizeof(struct mctp_hdr));
- if (mh->ver != 1)
+ ver = mh->ver & MCTP_HDR_VER_MASK;
+ if (ver < MCTP_VER_MIN || ver > MCTP_VER_MAX)
goto out;
flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM);
@@ -1317,6 +1319,7 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
struct mctp_dst dst;
struct mctp_hdr *mh;
int rc;
+ u8 ver;
rcu_read_lock();
mdev = __mctp_dev_get(dev);
@@ -1334,7 +1337,8 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
/* We have enough for a header; decode and route */
mh = mctp_hdr(skb);
- if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX)
+ ver = mh->ver & MCTP_HDR_VER_MASK;
+ if (ver < MCTP_VER_MIN || ver > MCTP_VER_MAX)
goto err_drop;
/* source must be valid unicast or null; drop reserved ranges and
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index fbffd3a43fe8..718e910ff23f 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3594,7 +3594,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
* uses the correct data
*/
mptcp_copy_inaddrs(nsk, ssk);
- __mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk);
msk->rcvq_space.time = mptcp_stamp();
@@ -4252,6 +4251,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
mptcp_graft_subflows(newsk);
mptcp_rps_record_subflows(msk);
+ __mptcp_propagate_sndbuf(newsk, mptcp_subflow_tcp_sock(subflow));
/* Do late cleanup for the first subflow as necessary. Also
* deal with bad peers not doing a complete shutdown.
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 0fb5162992e5..ce542ed4b013 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -102,6 +102,18 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
return dest_dst;
}
+/* Based on ip_exceeds_mtu(). */
+static bool ip_vs_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+ return false;
+
+ return true;
+}
+
static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
@@ -111,10 +123,9 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
*/
if (IP6CB(skb)->frag_max_size > mtu)
return true; /* largest fragment violate MTU */
- }
- else if (skb->len > mtu && !skb_is_gso(skb)) {
+ } else if (ip_vs_exceeds_mtu(skb, mtu))
return true; /* Packet size violate MTU size */
- }
+
return false;
}
@@ -232,7 +243,7 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
return true;
if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len > mtu && !skb_is_gso(skb) &&
+ ip_vs_exceeds_mtu(skb, mtu) &&
!ip_vs_iph_icmp(ipvsh))) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 98deef6cde69..8f1054920a85 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -50,7 +50,7 @@ static unsigned int help(struct sk_buff *skb,
return NF_DROP;
}
- sprintf(buffer, "%u", port);
+ snprintf(buffer, sizeof(buffer), "%u", port);
if (!nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
protoff, matchoff, matchlen,
buffer, strlen(buffer))) {
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 83b2b5e9759a..74ec224ce0d6 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1222,9 +1222,11 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
ret = nf_register_net_hooks(net, nat_ops, ops_count);
if (ret < 0) {
mutex_unlock(&nf_nat_proto_mutex);
- for (i = 0; i < ops_count; i++)
- kfree(nat_ops[i].priv);
- kfree(nat_ops);
+ for (i = 0; i < ops_count; i++) {
+ priv = nat_ops[i].priv;
+ kfree_rcu(priv, rcu_head);
+ }
+ kfree_rcu(nat_ops, rcu);
return ret;
}
@@ -1288,7 +1290,7 @@ void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
}
nat_proto_net->nat_hook_ops = NULL;
- kfree(nat_ops);
+ kfree_rcu(nat_ops, rcu);
}
unlock:
mutex_unlock(&nf_nat_proto_mutex);
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index cf4aeb299bde..c845b6d1a2bd 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -68,25 +68,27 @@ static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
}
static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer,
+ size_t size,
const union nf_inet_addr *addr, bool delim)
{
if (nf_ct_l3num(ct) == NFPROTO_IPV4)
- return sprintf(buffer, "%pI4", &addr->ip);
+ return scnprintf(buffer, size, "%pI4", &addr->ip);
else {
if (delim)
- return sprintf(buffer, "[%pI6c]", &addr->ip6);
+ return scnprintf(buffer, size, "[%pI6c]", &addr->ip6);
else
- return sprintf(buffer, "%pI6c", &addr->ip6);
+ return scnprintf(buffer, size, "%pI6c", &addr->ip6);
}
}
static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer,
+ size_t size,
const union nf_inet_addr *addr, u16 port)
{
if (nf_ct_l3num(ct) == NFPROTO_IPV4)
- return sprintf(buffer, "%pI4:%u", &addr->ip, port);
+ return scnprintf(buffer, size, "%pI4:%u", &addr->ip, port);
else
- return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port);
+ return scnprintf(buffer, size, "[%pI6c]:%u", &addr->ip6, port);
}
static int map_addr(struct sk_buff *skb, unsigned int protoff,
@@ -119,7 +121,7 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff,
if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
return 1;
- buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+ buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer), &newaddr, ntohs(newport));
return mangle_packet(skb, protoff, dataoff, dptr, datalen,
matchoff, matchlen, buffer, buflen);
}
@@ -212,7 +214,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
&addr, true) > 0 &&
nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
!nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
- buflen = sip_sprintf_addr(ct, buffer,
+ buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer),
&ct->tuplehash[!dir].tuple.dst.u3,
true);
if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -229,7 +231,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
&addr, false) > 0 &&
nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
!nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
- buflen = sip_sprintf_addr(ct, buffer,
+ buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer),
&ct->tuplehash[!dir].tuple.src.u3,
false);
if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
@@ -247,7 +249,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
- buflen = sprintf(buffer, "%u", ntohs(p));
+ buflen = scnprintf(buffer, sizeof(buffer), "%u", ntohs(p));
if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
poff, plen, buffer, buflen)) {
nf_ct_helper_log(skb, ct, "cannot mangle rport");
@@ -418,7 +420,8 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) ||
exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
- buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port);
+ buflen = sip_sprintf_addr_port(ct, buffer, sizeof(buffer),
+ &newaddr, port);
if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
matchoff, matchlen, buffer, buflen)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
@@ -438,8 +441,8 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[sizeof("4294967295")];
unsigned int matchoff, matchlen;
- char buffer[sizeof("65536")];
int buflen, c_len;
/* Get actual SDP length */
@@ -454,7 +457,7 @@ static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
&matchoff, &matchlen) <= 0)
return 0;
- buflen = sprintf(buffer, "%u", c_len);
+ buflen = scnprintf(buffer, sizeof(buffer), "%u", c_len);
return mangle_packet(skb, protoff, dataoff, dptr, datalen,
matchoff, matchlen, buffer, buflen);
}
@@ -491,7 +494,7 @@ static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
char buffer[INET6_ADDRSTRLEN];
unsigned int buflen;
- buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false);
if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
sdpoff, type, term, buffer, buflen))
return 0;
@@ -509,7 +512,7 @@ static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
char buffer[sizeof("nnnnn")];
unsigned int buflen;
- buflen = sprintf(buffer, "%u", port);
+ buflen = scnprintf(buffer, sizeof(buffer), "%u", port);
if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
matchoff, matchlen, buffer, buflen))
return 0;
@@ -529,7 +532,7 @@ static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff
unsigned int buflen;
/* Mangle session description owner and contact addresses */
- buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ buflen = sip_sprintf_addr(ct, buffer, sizeof(buffer), addr, false);
if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen))
return 0;
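
All of the sprintf() conversions in this file follow the same rule: pass the destination size, and feed the return value (bytes actually stored) to the mangling helpers. A standalone illustration of why scnprintf() is the right variant here (addr and port are assumed locals):

/* Sketch: snprintf() returns the length that *would* have been
 * written, which can exceed the buffer; scnprintf() returns the number
 * of bytes actually stored (excluding the NUL), the correct value to
 * use as a length downstream.
 */
char buf[sizeof("[ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]:65535")];
int len;

len = scnprintf(buf, sizeof(buf), "[%pI6c]:%u", &addr, ntohs(port));
/* len < sizeof(buf) is guaranteed; safe to pass on as buflen */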
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index d64ce21c7b55..acb753ec5697 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -31,26 +31,18 @@ EXPORT_SYMBOL_GPL(nf_osf_fingers);
static inline int nf_osf_ttl(const struct sk_buff *skb,
int ttl_check, unsigned char f_ttl)
{
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
- const struct in_ifaddr *ifa;
- int ret = 0;
- if (ttl_check == NF_OSF_TTL_TRUE)
+ switch (ttl_check) {
+ case NF_OSF_TTL_TRUE:
return ip->ttl == f_ttl;
- if (ttl_check == NF_OSF_TTL_NOCHECK)
- return 1;
- else if (ip->ttl <= f_ttl)
+ break;
+ case NF_OSF_TTL_NOCHECK:
return 1;
-
- in_dev_for_each_ifa_rcu(ifa, in_dev) {
- if (inet_ifa_match(ip->saddr, ifa)) {
- ret = (ip->ttl == f_ttl);
- break;
- }
+ case NF_OSF_TTL_LESS:
+ default:
+ return ip->ttl <= f_ttl;
}
-
- return ret;
}
struct nf_osf_hdr_ctx {
@@ -64,9 +56,9 @@ struct nf_osf_hdr_ctx {
static bool nf_osf_match_one(const struct sk_buff *skb,
const struct nf_osf_user_finger *f,
int ttl_check,
- struct nf_osf_hdr_ctx *ctx)
+ const struct nf_osf_hdr_ctx *ctx)
{
- const __u8 *optpinit = ctx->optp;
+ const __u8 *optp = ctx->optp;
unsigned int check_WSS = 0;
int fmatch = FMATCH_WRONG;
int foptsize, optnum;
@@ -95,17 +87,17 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
check_WSS = f->wss.wc;
for (optnum = 0; optnum < f->opt_num; ++optnum) {
- if (f->opt[optnum].kind == *ctx->optp) {
+ if (f->opt[optnum].kind == *optp) {
__u32 len = f->opt[optnum].length;
- const __u8 *optend = ctx->optp + len;
+ const __u8 *optend = optp + len;
fmatch = FMATCH_OK;
- switch (*ctx->optp) {
+ switch (*optp) {
case OSFOPT_MSS:
- mss = ctx->optp[3];
+ mss = optp[3];
mss <<= 8;
- mss |= ctx->optp[2];
+ mss |= optp[2];
mss = ntohs((__force __be16)mss);
break;
@@ -113,7 +105,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
break;
}
- ctx->optp = optend;
+ optp = optend;
} else
fmatch = FMATCH_OPT_WRONG;
@@ -156,9 +148,6 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
}
}
- if (fmatch != FMATCH_OK)
- ctx->optp = optpinit;
-
return fmatch == FMATCH_OK;
}
@@ -320,6 +309,10 @@ static int nfnl_osf_add_callback(struct sk_buff *skb,
if (f->opt_num > ARRAY_SIZE(f->opt))
return -EINVAL;
+ if (f->wss.wc >= OSF_WSS_MAX ||
+ (f->wss.wc == OSF_WSS_MODULO && f->wss.val == 0))
+ return -EINVAL;
+
for (i = 0; i < f->opt_num; i++) {
if (!f->opt[i].length || f->opt[i].length > MAX_IPOPTLEN)
return -EINVAL;
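
The new nfnl_osf_add_callback() check rejects fingerprints with an out-of-range window-size class, or a modulo class with a zero divisor, at add time. Sketched below is the failure mode this guards against, assuming the match path computes a modulo by f->wss.val (the exact match logic is condensed here):

/* Sketch: without add-time validation, a fingerprint loaded with
 * wc == OSF_WSS_MODULO and val == 0 would reach a division like this
 * at packet-match time and divide by zero in softirq context.
 */
switch (f->wss.wc) {
case OSF_WSS_MODULO:
	if ((ntohs(tcp->window) % f->wss.val) == 0)	/* val == 0 oops */
		fmatch = FMATCH_OK;
	break;
}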
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 18003433476c..c02d5cb52143 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct nf_osf_data data;
struct tcphdr _tcph;
+ if (nft_pf(pkt) != NFPROTO_IPV4) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
if (pkt->tprot != IPPROTO_TCP) {
regs->verdict.code = NFT_BREAK;
return;
@@ -114,7 +119,6 @@ static int nft_osf_validate(const struct nft_ctx *ctx,
switch (ctx->family) {
case NFPROTO_IPV4:
- case NFPROTO_IPV6:
case NFPROTO_INET:
hooks = (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_PRE_ROUTING) |
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index 4798cd2ca26e..7fc5156825e4 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -36,25 +36,37 @@ static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static struct xt_match mac_mt_reg __read_mostly = {
- .name = "mac",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .match = mac_mt,
- .matchsize = sizeof(struct xt_mac_info),
- .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_FORWARD),
- .me = THIS_MODULE,
+static struct xt_match mac_mt_reg[] __read_mostly = {
+ {
+ .name = "mac",
+ .family = NFPROTO_IPV4,
+ .match = mac_mt,
+ .matchsize = sizeof(struct xt_mac_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "mac",
+ .family = NFPROTO_IPV6,
+ .match = mac_mt,
+ .matchsize = sizeof(struct xt_mac_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD),
+ .me = THIS_MODULE,
+ },
};
static int __init mac_mt_init(void)
{
- return xt_register_match(&mac_mt_reg);
+ return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
}
static void __exit mac_mt_exit(void)
{
- xt_unregister_match(&mac_mt_reg);
+ xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg));
}
module_init(mac_mt_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 5bfb4843df66..8f2e57b2a586 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -127,26 +127,39 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
return true;
}
-static struct xt_match owner_mt_reg __read_mostly = {
- .name = "owner",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .checkentry = owner_check,
- .match = owner_mt,
- .matchsize = sizeof(struct xt_owner_match_info),
- .hooks = (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING),
- .me = THIS_MODULE,
+static struct xt_match owner_mt_reg[] __read_mostly = {
+ {
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .checkentry = owner_check,
+ .match = owner_mt,
+ .matchsize = sizeof(struct xt_owner_match_info),
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = owner_check,
+ .match = owner_mt,
+ .matchsize = sizeof(struct xt_owner_match_info),
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ }
};
static int __init owner_mt_init(void)
{
- return xt_register_match(&owner_mt_reg);
+ return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
}
static void __exit owner_mt_exit(void)
{
- xt_unregister_match(&owner_mt_reg);
+ xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
}
module_init(owner_mt_init);
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 53997771013f..d2b0b52434fa 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -137,24 +137,33 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
return 0;
}
-static struct xt_match physdev_mt_reg __read_mostly = {
- .name = "physdev",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = physdev_mt_check,
- .match = physdev_mt,
- .matchsize = sizeof(struct xt_physdev_info),
- .me = THIS_MODULE,
+static struct xt_match physdev_mt_reg[] __read_mostly = {
+ {
+ .name = "physdev",
+ .family = NFPROTO_IPV4,
+ .checkentry = physdev_mt_check,
+ .match = physdev_mt,
+ .matchsize = sizeof(struct xt_physdev_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "physdev",
+ .family = NFPROTO_IPV6,
+ .checkentry = physdev_mt_check,
+ .match = physdev_mt,
+ .matchsize = sizeof(struct xt_physdev_info),
+ .me = THIS_MODULE,
+ },
};
static int __init physdev_mt_init(void)
{
- return xt_register_match(&physdev_mt_reg);
+ return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
}
static void __exit physdev_mt_exit(void)
{
- xt_unregister_match(&physdev_mt_reg);
+ xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg));
}
module_init(physdev_mt_init);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 6df485f4403d..61b2f1e58d15 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -33,7 +33,7 @@ static struct xt_match realm_mt_reg __read_mostly = {
.matchsize = sizeof(struct xt_realm_info),
.hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
(1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.me = THIS_MODULE
};
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e209099218b4..bbbde50fc649 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2184,9 +2184,40 @@ error:
return err;
}
+static size_t ovs_vport_cmd_msg_size(void)
+{
+ size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
+
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_PORT_NO */
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_TYPE */
+ msgsize += nla_total_size(IFNAMSIZ); /* OVS_VPORT_ATTR_NAME */
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_VPORT_ATTR_IFINDEX */
+ msgsize += nla_total_size(sizeof(s32)); /* OVS_VPORT_ATTR_NETNSID */
+
+ /* OVS_VPORT_ATTR_STATS */
+ msgsize += nla_total_size_64bit(sizeof(struct ovs_vport_stats));
+
+ /* OVS_VPORT_ATTR_UPCALL_STATS(OVS_VPORT_UPCALL_ATTR_SUCCESS +
+ * OVS_VPORT_UPCALL_ATTR_FAIL)
+ */
+ msgsize += nla_total_size(nla_total_size_64bit(sizeof(u64)) +
+ nla_total_size_64bit(sizeof(u64)));
+
+ /* OVS_VPORT_ATTR_UPCALL_PID */
+ msgsize += nla_total_size(nr_cpu_ids * sizeof(u32));
+
+ /* OVS_VPORT_ATTR_OPTIONS(OVS_TUNNEL_ATTR_DST_PORT +
+ * OVS_TUNNEL_ATTR_EXTENSION(OVS_VXLAN_EXT_GBP))
+ */
+ msgsize += nla_total_size(nla_total_size(sizeof(u16)) +
+ nla_total_size(nla_total_size(0)));
+
+ return msgsize;
+}
+
static struct sk_buff *ovs_vport_cmd_alloc_info(void)
{
- return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ return genlmsg_new(ovs_vport_cmd_msg_size(), GFP_KERNEL);
}
/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
@@ -2196,7 +2227,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
struct sk_buff *skb;
int retval;
- skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ skb = ovs_vport_cmd_alloc_info();
if (!skb)
return ERR_PTR(-ENOMEM);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 23f629e94a36..56b2e2d1a749 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -406,6 +406,9 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids)
if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
return -EINVAL;
+ if (nla_len(ids) / sizeof(u32) > nr_cpu_ids)
+ return -EINVAL;
+
old = ovsl_dereference(vport->upcall_portids);
vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids),
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4b043241fd56..8e6f3a734ba0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2718,7 +2718,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
struct sk_buff *skb = NULL;
struct net_device *dev;
- struct virtio_net_hdr *vnet_hdr = NULL;
+ struct virtio_net_hdr vnet_hdr;
+ bool has_vnet_hdr = false;
struct sockcm_cookie sockc;
__be16 proto;
int err, reserve = 0;
@@ -2819,16 +2820,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
if (vnet_hdr_sz) {
- vnet_hdr = data;
data += vnet_hdr_sz;
tp_len -= vnet_hdr_sz;
- if (tp_len < 0 ||
- __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
+ if (tp_len < 0) {
+ tp_len = -EINVAL;
+ goto tpacket_error;
+ }
+ memcpy(&vnet_hdr, data - vnet_hdr_sz, sizeof(vnet_hdr));
+ if (__packet_snd_vnet_parse(&vnet_hdr, tp_len)) {
tp_len = -EINVAL;
goto tpacket_error;
}
copylen = __virtio16_to_cpu(vio_le(),
- vnet_hdr->hdr_len);
+ vnet_hdr.hdr_len);
+ has_vnet_hdr = true;
}
copylen = max_t(int, copylen, dev->hard_header_len);
skb = sock_alloc_send_skb(&po->sk,
@@ -2865,12 +2870,12 @@ tpacket_error:
}
}
- if (vnet_hdr_sz) {
- if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
+ if (has_vnet_hdr) {
+ if (virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le())) {
tp_len = -EINVAL;
goto tpacket_error;
}
- virtio_net_hdr_set_proto(skb, vnet_hdr);
+ virtio_net_hdr_set_proto(skb, &vnet_hdr);
}
skb->destructor = tpacket_destruct_skb;
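
The tpacket_snd() change is a time-of-check/time-of-use fix: the virtio header sits in ring memory that user space can rewrite at any moment, so the kernel now snapshots it to the stack once and both validates and uses that private copy. The general shape, with hypothetical helpers (validate_hdr, use_hdr):

/* Sketch: "data" points into memory shared with user space. Take one
 * snapshot; checking the shared copy and then re-reading it would let
 * the other side flip the contents between check and use.
 */
struct virtio_net_hdr hdr;

memcpy(&hdr, data, sizeof(hdr));	/* single snapshot */
if (validate_hdr(&hdr, len))		/* check the snapshot...   */
	return -EINVAL;
use_hdr(skb, &hdr);			/* ...and use the same one */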
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 412441aaa298..c10b7ed06c49 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -701,6 +701,13 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ /* Zero the per-item buffer before handing it to the
+ * visitor so any field the visitor does not write -
+ * including implicit alignment padding - cannot leak
+ * stack contents to user space via rds_info_copy().
+ */
+ memset(buffer, 0, item_len);
+
/* XXX no c_lock usage.. */
if (!visitor(conn, buffer))
continue;
@@ -750,6 +757,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
*/
cp = conn->c_path;
+ /* Zero the per-item buffer for the same reason as
+ * rds_for_each_conn_info(): any byte the visitor
+ * does not write (including alignment padding) must
+ * not leak stack contents via rds_info_copy().
+ */
+ memset(buffer, 0, item_len);
+
/* XXX no cp_lock usage.. */
if (!visitor(cp, buffer))
continue;
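
Both RDS walkers now clear the scratch buffer before each visitor call. The leak being closed is not only unwritten fields but compiler-inserted padding, which no field assignment can ever cover; an illustrative struct (not from the RDS code):

/* Sketch: "flags" is followed by 7 bytes of implicit padding that no
 * member assignment writes. Without the memset(), whatever the
 * previous stack user left there is copied out via rds_info_copy().
 */
struct info_item {
	u8  flags;	/* written by the visitor      */
			/* 7 bytes of implicit padding */
	u64 counter;	/* written by the visitor      */
};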
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index aa6465dc742c..61fb6e45281b 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -326,10 +326,6 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
if (args->cookie_addr &&
put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
- if (!need_odp) {
- unpin_user_pages(pages, nr_pages);
- kfree(sg);
- }
ret = -EFAULT;
goto out;
}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 96ecb83c9071..27c2aa2dd023 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -1486,7 +1486,6 @@ int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int);
void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
-void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index fec59d9338b9..fdd683261226 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -332,7 +332,25 @@ bool rxrpc_input_call_event(struct rxrpc_call *call)
saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
- rxrpc_input_call_packet(call, skb);
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+ sp->hdr.securityIndex != 0 &&
+ skb_cloned(skb)) {
+ /* Unshare the packet so that it can be
+ * modified for in-place decryption.
+ */
+ struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+
+ if (nskb) {
+ rxrpc_new_skb(nskb, rxrpc_skb_new_unshared);
+ rxrpc_input_call_packet(call, nskb);
+ rxrpc_free_skb(nskb, rxrpc_skb_put_call_rx);
+ } else {
+ /* OOM - Drop the packet. */
+ rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem);
+ }
+ } else {
+ rxrpc_input_call_packet(call, skb);
+ }
rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
did_receive = true;
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 9a41ec708aeb..a2130d25aaa9 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -240,6 +240,33 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call)
rxrpc_notify_socket(call);
}
+static int rxrpc_verify_response(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ if (skb_cloned(skb)) {
+ /* Copy the packet if shared so that we can do in-place
+ * decryption.
+ */
+ struct sk_buff *nskb = skb_copy(skb, GFP_NOFS);
+
+ if (nskb) {
+ rxrpc_new_skb(nskb, rxrpc_skb_new_unshared);
+ ret = conn->security->verify_response(conn, nskb);
+ rxrpc_free_skb(nskb, rxrpc_skb_put_response_copy);
+ } else {
+ /* OOM - Drop the packet. */
+ rxrpc_see_skb(skb, rxrpc_skb_see_unshare_nomem);
+ ret = -ENOMEM;
+ }
+ } else {
+ ret = conn->security->verify_response(conn, skb);
+ }
+
+ return ret;
+}
+
/*
* connection-level Rx packet processor
*/
@@ -270,7 +297,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
}
spin_unlock_irq(&conn->state_lock);
- ret = conn->security->verify_response(conn, skb);
+ ret = rxrpc_verify_response(conn, skb);
if (ret < 0)
return ret;
@@ -362,7 +389,6 @@ again:
static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
{
struct sk_buff *skb;
- int ret;
if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
rxrpc_secure_connection(conn);
@@ -371,17 +397,8 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
* connection that each one has when we've finished with it */
while ((skb = skb_dequeue(&conn->rx_queue))) {
rxrpc_see_skb(skb, rxrpc_skb_see_conn_work);
- ret = rxrpc_process_event(conn, skb);
- switch (ret) {
- case -ENOMEM:
- case -EAGAIN:
- skb_queue_head(&conn->rx_queue, skb);
- rxrpc_queue_conn(conn, rxrpc_conn_queue_retry_work);
- break;
- default:
- rxrpc_free_skb(skb, rxrpc_skb_put_conn_work);
- break;
- }
+ rxrpc_process_event(conn, skb);
+ rxrpc_free_skb(skb, rxrpc_skb_put_conn_work);
}
}
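
The call_event.c and conn_event.c hunks apply the same rule in two places: before a packet is handed to in-place decryption, skb_cloned() is checked and, if the data is shared, a private skb_copy() is processed instead, with allocation failure degrading to a packet drop. Condensed into one hypothetical helper (decrypt_and_deliver is illustrative):

/* Sketch: in-place decryption scribbles on skb->data, which must not
 * be visible through other clones. Copy first if shared; treat copy
 * failure as loss (the peer retransmits) rather than corrupting
 * shared data.
 */
static int input_secured_packet(struct sk_buff *skb)
{
	struct sk_buff *pkt = skb;
	int ret;

	if (skb_cloned(skb)) {
		pkt = skb_copy(skb, GFP_ATOMIC);
		if (!pkt)
			return -ENOMEM;		/* OOM: drop the packet */
	}
	ret = decrypt_and_deliver(pkt);		/* hypothetical consumer */
	if (pkt != skb)
		kfree_skb(pkt);
	return ret;
}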
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 697956931925..dc5184a2fa9d 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -192,13 +192,12 @@ static bool rxrpc_extract_abort(struct sk_buff *skb)
/*
* Process packets received on the local endpoint
*/
-static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb)
+static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb)
{
struct rxrpc_connection *conn;
struct sockaddr_rxrpc peer_srx;
struct rxrpc_skb_priv *sp;
struct rxrpc_peer *peer = NULL;
- struct sk_buff *skb = *_skb;
bool ret = false;
skb_pull(skb, sizeof(struct udphdr));
@@ -244,25 +243,6 @@ static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb)
return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call);
if (sp->hdr.seq == 0)
return rxrpc_bad_message(skb, rxrpc_badmsg_zero_seq);
-
- /* Unshare the packet so that it can be modified for in-place
- * decryption.
- */
- if (sp->hdr.securityIndex != 0) {
- skb = skb_unshare(skb, GFP_ATOMIC);
- if (!skb) {
- rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem);
- *_skb = NULL;
- return just_discard;
- }
-
- if (skb != *_skb) {
- rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare);
- *_skb = skb;
- rxrpc_new_skb(skb, rxrpc_skb_new_unshared);
- sp = rxrpc_skb(skb);
- }
- }
break;
case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -494,7 +474,7 @@ int rxrpc_io_thread(void *data)
switch (skb->mark) {
case RXRPC_SKB_MARK_PACKET:
skb->priority = 0;
- if (!rxrpc_input_packet(local, &skb))
+ if (!rxrpc_input_packet(local, skb))
rxrpc_reject_packet(local, skb);
trace_rxrpc_rx_done(skb->mark, skb->priority);
rxrpc_free_skb(skb, rxrpc_skb_put_input);
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 6301d79ee35a..3ec3d89fdf14 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -502,6 +502,10 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep)
if (v1->security_index != RXRPC_SECURITY_RXKAD)
goto error;
+ ret = -EKEYREJECTED;
+ if (v1->ticket_length > AFSTOKEN_RK_TIX_MAX)
+ goto error;
+
plen = sizeof(*token->kad) + v1->ticket_length;
prep->quotalen += plen + sizeof(*token);
diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c
index 30275cb5ba3e..0ef2a29eb695 100644
--- a/net/rxrpc/rxgk_app.c
+++ b/net/rxrpc/rxgk_app.c
@@ -214,7 +214,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
ticket_len = ntohl(container.token_len);
ticket_offset = token_offset + sizeof(container);
- if (xdr_round_up(ticket_len) > token_len - sizeof(container))
+ if (ticket_len > xdr_round_down(token_len - sizeof(container)))
goto short_packet;
_debug("KVNO %u", kvno);
@@ -245,6 +245,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb,
if (ret != -ENOMEM)
return rxrpc_abort_conn(conn, skb, ec, ret,
rxgk_abort_resp_tok_dec);
+ return ret;
}
ret = conn->security->default_decode_ticket(conn, skb, ticket_offset,
diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h
index 80164d89e19c..1e257d7ab8ec 100644
--- a/net/rxrpc/rxgk_common.h
+++ b/net/rxrpc/rxgk_common.h
@@ -34,6 +34,7 @@ struct rxgk_context {
};
#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_round_down(x) (round_down((x), sizeof(__be32)))
#define xdr_object_len(x) (4 + xdr_round_up(x))
/*
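
The rxgk length check is inverted so that rounding is applied to the trusted side: rounding the attacker-supplied ticket_len up can wrap a 32-bit value, while rounding the known remaining space down cannot. Worked through:

/* Sketch: with u32 arithmetic, xdr_round_up(0xfffffffe) wraps to 0 and
 * would pass any "<= space" test. Rounding the trusted remaining space
 * DOWN cannot overflow, so the comparison holds for every ticket_len.
 */
u32 space = token_len - sizeof(container);

if (ticket_len > round_down(space, sizeof(__be32)))
	goto short_packet;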
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index eb7f2769d2b1..cba7935977f0 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -510,6 +510,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON,
rxkad_abort_2_short_header);
+ /* Don't let the crypto algo see a misaligned length. */
+ sp->len = round_down(sp->len, 8);
+
/* Decrypt the skbuff in-place. TODO: We really want to decrypt
* directly into the target buffer.
*/
@@ -543,8 +546,10 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
if (sg != _sg)
kfree(sg);
if (ret < 0) {
- WARN_ON_ONCE(ret != -ENOMEM);
- return ret;
+ if (ret == -ENOMEM)
+ return ret;
+ return rxrpc_abort_eproto(call, skb, RXKADSEALEDINCON,
+ rxkad_abort_2_crypto_unaligned);
}
/* Extract the decrypted packet length */
@@ -1136,7 +1141,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
struct rxrpc_crypt session_key;
struct key *server_key;
time64_t expiry;
- void *ticket;
+ void *ticket = NULL;
u32 version, kvno, ticket_len, level;
__be32 csum;
int ret, i;
@@ -1162,13 +1167,13 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
ret = -ENOMEM;
response = kzalloc_obj(struct rxkad_response, GFP_NOFS);
if (!response)
- goto temporary_error;
+ goto error;
if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
response, sizeof(*response)) < 0) {
- rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
- rxkad_abort_resp_short);
- goto protocol_error;
+ ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
+ rxkad_abort_resp_short);
+ goto error;
}
version = ntohl(response->version);
@@ -1178,62 +1183,62 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len);
if (version != RXKAD_VERSION) {
- rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
- rxkad_abort_resp_version);
- goto protocol_error;
+ ret = rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO,
+ rxkad_abort_resp_version);
+ goto error;
}
if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) {
- rxrpc_abort_conn(conn, skb, RXKADTICKETLEN, -EPROTO,
- rxkad_abort_resp_tkt_len);
- goto protocol_error;
+ ret = rxrpc_abort_conn(conn, skb, RXKADTICKETLEN, -EPROTO,
+ rxkad_abort_resp_tkt_len);
+ goto error;
}
if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) {
- rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, -EPROTO,
- rxkad_abort_resp_unknown_tkt);
- goto protocol_error;
+ ret = rxrpc_abort_conn(conn, skb, RXKADUNKNOWNKEY, -EPROTO,
+ rxkad_abort_resp_unknown_tkt);
+ goto error;
}
/* extract the kerberos ticket and decrypt and decode it */
ret = -ENOMEM;
ticket = kmalloc(ticket_len, GFP_NOFS);
if (!ticket)
- goto temporary_error_free_resp;
+ goto error;
if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response),
ticket, ticket_len) < 0) {
- rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
- rxkad_abort_resp_short_tkt);
- goto protocol_error;
+ ret = rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO,
+ rxkad_abort_resp_short_tkt);
+ goto error;
}
ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len,
&session_key, &expiry);
if (ret < 0)
- goto temporary_error_free_ticket;
+ goto error;
/* use the session key from inside the ticket to decrypt the
* response */
ret = rxkad_decrypt_response(conn, response, &session_key);
if (ret < 0)
- goto temporary_error_free_ticket;
+ goto error;
if (ntohl(response->encrypted.epoch) != conn->proto.epoch ||
ntohl(response->encrypted.cid) != conn->proto.cid ||
ntohl(response->encrypted.securityIndex) != conn->security_ix) {
- rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
- rxkad_abort_resp_bad_param);
- goto protocol_error_free;
+ ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
+ rxkad_abort_resp_bad_param);
+ goto error;
}
csum = response->encrypted.checksum;
response->encrypted.checksum = 0;
rxkad_calc_response_checksum(response);
if (response->encrypted.checksum != csum) {
- rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
- rxkad_abort_resp_bad_checksum);
- goto protocol_error_free;
+ ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
+ rxkad_abort_resp_bad_checksum);
+ goto error;
}
for (i = 0; i < RXRPC_MAXCALLS; i++) {
@@ -1241,38 +1246,38 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
u32 counter = READ_ONCE(conn->channels[i].call_counter);
if (call_id > INT_MAX) {
- rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
- rxkad_abort_resp_bad_callid);
- goto protocol_error_free;
+ ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
+ rxkad_abort_resp_bad_callid);
+ goto error;
}
if (call_id < counter) {
- rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
- rxkad_abort_resp_call_ctr);
- goto protocol_error_free;
+ ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
+ rxkad_abort_resp_call_ctr);
+ goto error;
}
if (call_id > counter) {
if (conn->channels[i].call) {
- rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
+ ret = rxrpc_abort_conn(conn, skb, RXKADSEALEDINCON, -EPROTO,
rxkad_abort_resp_call_state);
- goto protocol_error_free;
+ goto error;
}
conn->channels[i].call_counter = call_id;
}
}
if (ntohl(response->encrypted.inc_nonce) != conn->rxkad.nonce + 1) {
- rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO,
- rxkad_abort_resp_ooseq);
- goto protocol_error_free;
+ ret = rxrpc_abort_conn(conn, skb, RXKADOUTOFSEQUENCE, -EPROTO,
+ rxkad_abort_resp_ooseq);
+ goto error;
}
level = ntohl(response->encrypted.level);
if (level > RXRPC_SECURITY_ENCRYPT) {
- rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EPROTO,
- rxkad_abort_resp_level);
- goto protocol_error_free;
+ ret = rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EPROTO,
+ rxkad_abort_resp_level);
+ goto error;
}
conn->security_level = level;
@@ -1280,31 +1285,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
* this the connection security can be handled in exactly the same way
* as for a client connection */
ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno);
- if (ret < 0)
- goto temporary_error_free_ticket;
-
- kfree(ticket);
- kfree(response);
- _leave(" = 0");
- return 0;
-
-protocol_error_free:
- kfree(ticket);
-protocol_error:
- kfree(response);
- key_put(server_key);
- return -EPROTO;
-temporary_error_free_ticket:
+error:
kfree(ticket);
-temporary_error_free_resp:
kfree(response);
-temporary_error:
- /* Ignore the response packet if we got a temporary error such as
- * ENOMEM. We just want to send the challenge again. Note that we
- * also come out this way if the ticket decryption fails.
- */
key_put(server_key);
+ _leave(" = %d", ret);
return ret;
}
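
The hunk above folds the separate protocol/temporary exit labels into one: rxrpc_abort_conn() returns the negative errno it was given, so every failure path can record its result in ret and share a single cleanup block, relying on kfree(NULL) being a no-op. A minimal compile-able sketch of that shape, where abort_conn(), verify_response() and the placeholder abort code are hypothetical stand-ins, not the kernel's code:

#include <errno.h>
#include <stdlib.h>

struct conn;

enum { ABORT_PACKET_SHORT = 1 };	/* placeholder abort code */

/* stand-in for rxrpc_abort_conn(): reports the abort, then hands the
 * negative errno back so the caller can store it in ret */
static int abort_conn(struct conn *conn, int abort_code, int err)
{
	(void)conn;
	(void)abort_code;
	return err;
}

static int verify_response(struct conn *conn, const unsigned char *pkt,
			   size_t len)
{
	unsigned char *response = NULL, *ticket = NULL;
	int ret;

	(void)pkt;
	ret = -ENOMEM;
	response = malloc(64);
	if (!response)
		goto error;
	ticket = malloc(len ? len : 1);
	if (!ticket)
		goto error;

	if (len < 8) {		/* short packet: abort and propagate */
		ret = abort_conn(conn, ABORT_PACKET_SHORT, -EPROTO);
		goto error;
	}

	ret = 0;
error:
	free(ticket);		/* free(NULL) is a no-op: one label fits all */
	free(response);
	return ret;
}
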
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 3bcd6ee80396..e2169d1a14b5 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -47,15 +47,6 @@ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
}
/*
- * Note the dropping of a ref on a socket buffer by the core.
- */
-void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
-{
- int n = atomic_inc_return(&rxrpc_n_rx_skbs);
- trace_rxrpc_skb(skb, 0, n, why);
-}
-
-/*
* Note the destruction of a socket buffer.
*/
void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 05e0b14b5773..2c5a7a321a94 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -354,7 +354,7 @@ static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m,
goto assign_prev;
tcf_mirred_to_dev(skb, m, dev_prev,
- dev_is_mac_header_xmit(dev),
+ dev_is_mac_header_xmit(dev_prev),
mirred_eaction, retval);
assign_prev:
dev_prev = dev;
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index ffea9fbd522d..02e1fa4577ae 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -619,7 +619,7 @@ static bool cake_update_flowkeys(struct flow_keys *keys,
}
port = rev ? tuple.src.u.all : tuple.dst.u.all;
if (port != keys->ports.dst) {
- port = keys->ports.dst;
+ keys->ports.dst = port;
upd = true;
}
}
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index fe6f5e889625..241e6a46bd00 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -868,11 +868,35 @@ static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
old_backlog = sch->qstats.backlog;
while (qdisc_qlen(sch) > sch->limit ||
q->memory_used > q->memory_limit) {
- struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
+ struct sk_buff *skb = NULL;
- q->memory_used -= skb->truesize;
- qdisc_qstats_backlog_dec(sch, skb);
- rtnl_qdisc_drop(skb, sch);
+ if (qdisc_qlen(sch) > qdisc_qlen(q->l_queue)) {
+ skb = qdisc_dequeue_internal(sch, true);
+ if (unlikely(!skb)) {
+ WARN_ON_ONCE(1);
+ break;
+ }
+ q->memory_used -= skb->truesize;
+ rtnl_qdisc_drop(skb, sch);
+ } else if (qdisc_qlen(q->l_queue)) {
+ skb = qdisc_dequeue_internal(q->l_queue, true);
+ if (unlikely(!skb)) {
+ WARN_ON_ONCE(1);
+ break;
+ }
+			/* L-queue packets are counted in both sch and
+			 * l_queue on enqueue; qdisc_dequeue_internal()
+			 * has already adjusted l_queue's counters, so
+			 * account for sch here as well.
+			 */
+ --sch->q.qlen;
+ qdisc_qstats_backlog_dec(sch, skb);
+ q->memory_used -= skb->truesize;
+ rtnl_qdisc_drop(skb, q->l_queue);
+ qdisc_qstats_drop(sch);
+ } else {
+ WARN_ON_ONCE(1);
+ break;
+ }
}
qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
old_backlog - sch->qstats.backlog);
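
The comment added in the hunk above encodes an invariant: every L-queue packet is counted both in the parent qdisc and in l_queue, so a drop taken directly from l_queue owes the parent a second decrement. A small sketch of that accounting, using hypothetical counter names rather than the dualpi2 structures:

struct dualpi2_len {
	unsigned int sch_qlen;	/* all packets, classic and L-queue alike */
	unsigned int l_qlen;	/* L-queue packets, counted a second time */
};

/* packets sitting in the classic queue only */
static unsigned int classic_qlen(const struct dualpi2_len *q)
{
	return q->sch_qlen - q->l_qlen;
}

/* a drop taken straight from the L queue: the dequeue helper adjusts
 * l_qlen, and the caller still owes the parent-side decrement */
static void account_l_drop(struct dualpi2_len *q)
{
	q->l_qlen--;
	q->sch_qlen--;
}
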
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 2a3d758f67ab..0664b2f2d6f2 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -585,6 +585,8 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
};
struct list_head *pos;
+ sch_tree_lock(sch);
+
st.qdisc_stats.maxpacket = q->cstats.maxpacket;
st.qdisc_stats.drop_overlimit = q->drop_overlimit;
st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
@@ -593,7 +595,6 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.qdisc_stats.memory_usage = q->memory_usage;
st.qdisc_stats.drop_overmemory = q->drop_overmemory;
- sch_tree_lock(sch);
list_for_each(pos, &q->new_flows)
st.qdisc_stats.new_flows_len++;
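
Moving sch_tree_lock() above the counter snapshot puts the counters and the flow-list walk into one critical section, so the dumped values cannot disagree with each other under concurrent updates. A minimal userspace sketch of the snapshot-under-lock pattern, with hypothetical types and a pthread mutex standing in for the qdisc tree lock:

#include <pthread.h>

struct qd {
	pthread_mutex_t lock;	/* stands in for the qdisc tree lock */
	unsigned long drop_overlimit;
	unsigned int new_flows_len;
};

struct qd_snapshot {
	unsigned long drop_overlimit;
	unsigned int new_flows_len;
};

/* both values are captured in one critical section, so the snapshot
 * is internally consistent even while the fast path updates them */
static void dump_stats(struct qd *q, struct qd_snapshot *out)
{
	pthread_mutex_lock(&q->lock);
	out->drop_overlimit = q->drop_overlimit;
	out->new_flows_len = q->new_flows_len;
	pthread_mutex_unlock(&q->lock);
}
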
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 95e5d9bfd9c8..96021f52d835 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -198,7 +198,8 @@ static struct hh_flow_state *seek_list(const u32 hash,
return NULL;
list_del(&flow->flowchain);
kfree(flow);
- q->hh_flows_current_cnt--;
+ WRITE_ONCE(q->hh_flows_current_cnt,
+ q->hh_flows_current_cnt - 1);
} else if (flow->hash_id == hash) {
return flow;
}
@@ -226,7 +227,7 @@ static struct hh_flow_state *alloc_new_hh(struct list_head *head,
}
if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
- q->hh_flows_overlimit++;
+ WRITE_ONCE(q->hh_flows_overlimit, q->hh_flows_overlimit + 1);
return NULL;
}
/* Create new entry. */
@@ -234,7 +235,7 @@ static struct hh_flow_state *alloc_new_hh(struct list_head *head,
if (!flow)
return NULL;
- q->hh_flows_current_cnt++;
+ WRITE_ONCE(q->hh_flows_current_cnt, q->hh_flows_current_cnt + 1);
INIT_LIST_HEAD(&flow->flowchain);
list_add_tail(&flow->flowchain, head);
@@ -309,7 +310,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
return WDRR_BUCKET_FOR_NON_HH;
flow->hash_id = hash;
flow->hit_timestamp = now;
- q->hh_flows_total_cnt++;
+ WRITE_ONCE(q->hh_flows_total_cnt, q->hh_flows_total_cnt + 1);
/* By returning without updating counters in q->hhf_arrays,
* we implicitly implement "shielding" (see Optimization O1).
@@ -403,7 +404,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return NET_XMIT_SUCCESS;
prev_backlog = sch->qstats.backlog;
- q->drop_overlimit++;
+ WRITE_ONCE(q->drop_overlimit, q->drop_overlimit + 1);
/* Return Congestion Notification only if we dropped a packet from this
* bucket.
*/
@@ -686,10 +687,10 @@ static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct tc_hhf_xstats st = {
- .drop_overlimit = q->drop_overlimit,
- .hh_overlimit = q->hh_flows_overlimit,
- .hh_tot_count = q->hh_flows_total_cnt,
- .hh_cur_count = q->hh_flows_current_cnt,
+ .drop_overlimit = READ_ONCE(q->drop_overlimit),
+ .hh_overlimit = READ_ONCE(q->hh_flows_overlimit),
+ .hh_tot_count = READ_ONCE(q->hh_flows_total_cnt),
+ .hh_cur_count = READ_ONCE(q->hh_flows_current_cnt),
};
return gnet_stats_copy_app(d, &st, sizeof(st));
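
The hhf changes are data-race annotations: the counters are written under the qdisc lock but read locklessly by the dump path, so stores use WRITE_ONCE() against store tearing and loads use READ_ONCE() against load tearing. A compile-able userspace sketch of the pairing, with the kernel macros re-derived as volatile accesses and a hypothetical hh_stats type:

#include <stdint.h>

/* userspace re-derivations of the kernel annotations: force exactly
 * one access of the given width through a volatile pointer */
#define READ_ONCE(x)	 (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct hh_stats { uint64_t drop_overlimit; };

/* writer: runs under the qdisc lock, so the plain read of the old
 * value cannot race with other writers; WRITE_ONCE() keeps the store
 * from tearing under a concurrent lockless reader */
static void note_overlimit(struct hh_stats *s)
{
	WRITE_ONCE(s->drop_overlimit, s->drop_overlimit + 1);
}

/* reader: the dump path takes no lock; READ_ONCE() avoids load
 * tearing and documents the intentional race */
static uint64_t snapshot_overlimit(const struct hh_stats *s)
{
	return READ_ONCE(s->drop_overlimit);
}
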
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 16f3f629cb8e..fb53fbf0e328 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -90,7 +90,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
bool enqueue = false;
if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
- q->stats.overlimit++;
+ WRITE_ONCE(q->stats.overlimit, q->stats.overlimit + 1);
goto out;
}
@@ -104,7 +104,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* If packet is ecn capable, mark it if drop probability
* is lower than 10%, else drop it.
*/
- q->stats.ecn_mark++;
+ WRITE_ONCE(q->stats.ecn_mark, q->stats.ecn_mark + 1);
enqueue = true;
}
@@ -114,15 +114,15 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (!q->params.dq_rate_estimator)
pie_set_enqueue_time(skb);
- q->stats.packets_in++;
+ WRITE_ONCE(q->stats.packets_in, q->stats.packets_in + 1);
if (qdisc_qlen(sch) > q->stats.maxq)
- q->stats.maxq = qdisc_qlen(sch);
+ WRITE_ONCE(q->stats.maxq, qdisc_qlen(sch));
return qdisc_enqueue_tail(skb, sch);
}
out:
- q->stats.dropped++;
+ WRITE_ONCE(q->stats.dropped, q->stats.dropped + 1);
q->vars.accu_prob = 0;
return qdisc_drop_reason(skb, sch, to_free, reason);
}
@@ -267,11 +267,11 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
count = count / dtime;
if (vars->avg_dq_rate == 0)
- vars->avg_dq_rate = count;
+ WRITE_ONCE(vars->avg_dq_rate, count);
else
- vars->avg_dq_rate =
+ WRITE_ONCE(vars->avg_dq_rate,
(vars->avg_dq_rate -
- (vars->avg_dq_rate >> 3)) + (count >> 3);
+ (vars->avg_dq_rate >> 3)) + (count >> 3));
/* If the queue has receded below the threshold, we hold
* on to the last drain rate calculated, else we reset
@@ -381,7 +381,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
if (delta > 0) {
/* prevent overflow */
if (vars->prob < oldprob) {
- vars->prob = MAX_PROB;
+ WRITE_ONCE(vars->prob, MAX_PROB);
/* Prevent normalization error. If probability is at
* maximum value already, we normalize it here, and
* skip the check to do a non-linear drop in the next
@@ -392,7 +392,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
} else {
/* prevent underflow */
if (vars->prob > oldprob)
- vars->prob = 0;
+ WRITE_ONCE(vars->prob, 0);
}
/* Non-linear drop in probability: Reduce drop probability quickly if
@@ -403,7 +403,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
/* Reduce drop probability to 98.4% */
vars->prob -= vars->prob / 64;
- vars->qdelay = qdelay;
+ WRITE_ONCE(vars->qdelay, qdelay);
vars->backlog_old = backlog;
/* We restart the measurement cycle if the following conditions are met
@@ -502,21 +502,21 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
.prob = q->vars.prob << BITS_PER_BYTE,
- .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
+ .delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) /
NSEC_PER_USEC,
- .packets_in = q->stats.packets_in,
- .overlimit = q->stats.overlimit,
- .maxq = q->stats.maxq,
- .dropped = q->stats.dropped,
- .ecn_mark = q->stats.ecn_mark,
+ .packets_in = READ_ONCE(q->stats.packets_in),
+ .overlimit = READ_ONCE(q->stats.overlimit),
+ .maxq = READ_ONCE(q->stats.maxq),
+ .dropped = READ_ONCE(q->stats.dropped),
+ .ecn_mark = READ_ONCE(q->stats.ecn_mark),
};
/* avg_dq_rate is only valid if dq_rate_estimator is enabled */
st.dq_rate_estimating = q->params.dq_rate_estimator;
/* unscale and return dq_rate in bytes per sec */
- if (q->params.dq_rate_estimator)
- st.avg_dq_rate = q->vars.avg_dq_rate *
+ if (st.dq_rate_estimating)
+ st.avg_dq_rate = READ_ONCE(q->vars.avg_dq_rate) *
(PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
return gnet_stats_copy_app(d, &st, sizeof(st));
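
Beyond the same annotation treatment, the reformatted avg_dq_rate update is a fixed-point EWMA: new = old - old/8 + sample/8 = (7*old + sample)/8, a smoothing factor of 1/8. A sketch with a hypothetical name:

#include <stdint.h>

/* seed with the first sample, then new = old - old/8 + sample/8,
 * i.e. (7*old + sample)/8: a fixed-point EWMA with alpha = 1/8 */
static uint32_t ewma_eighth(uint32_t avg, uint32_t sample)
{
	if (avg == 0)
		return sample;
	return (avg - (avg >> 3)) + (sample >> 3);
}

For avg = 800 and sample = 1600 this yields 800 - 100 + 200 = 900, pulling the estimate one eighth of the way toward the new sample.
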
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index c8d3d09f15e3..432b8a3000a5 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -90,17 +90,20 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
if (!red_use_ecn(q)) {
- q->stats.prob_drop++;
+ WRITE_ONCE(q->stats.prob_drop,
+ q->stats.prob_drop + 1);
goto congestion_drop;
}
if (INET_ECN_set_ce(skb)) {
- q->stats.prob_mark++;
+ WRITE_ONCE(q->stats.prob_mark,
+ q->stats.prob_mark + 1);
skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret);
if (!skb)
return NET_XMIT_CN | ret;
} else if (!red_use_nodrop(q)) {
- q->stats.prob_drop++;
+ WRITE_ONCE(q->stats.prob_drop,
+ q->stats.prob_drop + 1);
goto congestion_drop;
}
@@ -111,17 +114,20 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
reason = QDISC_DROP_OVERLIMIT;
qdisc_qstats_overlimit(sch);
if (red_use_harddrop(q) || !red_use_ecn(q)) {
- q->stats.forced_drop++;
+ WRITE_ONCE(q->stats.forced_drop,
+ q->stats.forced_drop + 1);
goto congestion_drop;
}
if (INET_ECN_set_ce(skb)) {
- q->stats.forced_mark++;
+ WRITE_ONCE(q->stats.forced_mark,
+ q->stats.forced_mark + 1);
skb = tcf_qevent_handle(&q->qe_mark, sch, skb, to_free, &ret);
if (!skb)
return NET_XMIT_CN | ret;
} else if (!red_use_nodrop(q)) {
- q->stats.forced_drop++;
+ WRITE_ONCE(q->stats.forced_drop,
+ q->stats.forced_drop + 1);
goto congestion_drop;
}
@@ -135,7 +141,8 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
sch->qstats.backlog += len;
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
- q->stats.pdrop++;
+ WRITE_ONCE(q->stats.pdrop,
+ q->stats.pdrop + 1);
qdisc_qstats_drop(sch);
}
return ret;
@@ -463,9 +470,13 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
&hw_stats_request);
}
- st.early = q->stats.prob_drop + q->stats.forced_drop;
- st.pdrop = q->stats.pdrop;
- st.marked = q->stats.prob_mark + q->stats.forced_mark;
+ st.early = READ_ONCE(q->stats.prob_drop) +
+ READ_ONCE(q->stats.forced_drop);
+
+ st.pdrop = READ_ONCE(q->stats.pdrop);
+
+ st.marked = READ_ONCE(q->stats.prob_mark) +
+ READ_ONCE(q->stats.forced_mark);
return gnet_stats_copy_app(d, &st, sizeof(st));
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 013738662128..bd5ef561030f 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -130,7 +130,7 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
sfbhash >>= SFB_BUCKET_SHIFT;
if (b[hash].qlen < 0xFFFF)
- b[hash].qlen++;
+ WRITE_ONCE(b[hash].qlen, b[hash].qlen + 1);
b += SFB_NUMBUCKETS; /* next level */
}
}
@@ -159,7 +159,7 @@ static void decrement_one_qlen(u32 sfbhash, u32 slot,
sfbhash >>= SFB_BUCKET_SHIFT;
if (b[hash].qlen > 0)
- b[hash].qlen--;
+ WRITE_ONCE(b[hash].qlen, b[hash].qlen - 1);
b += SFB_NUMBUCKETS; /* next level */
}
}
@@ -179,12 +179,12 @@ static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
{
- b->p_mark = prob_minus(b->p_mark, q->decrement);
+ WRITE_ONCE(b->p_mark, prob_minus(b->p_mark, q->decrement));
}
static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
{
- b->p_mark = prob_plus(b->p_mark, q->increment);
+ WRITE_ONCE(b->p_mark, prob_plus(b->p_mark, q->increment));
}
static void sfb_zero_all_buckets(struct sfb_sched_data *q)
@@ -202,11 +202,14 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da
const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
- if (qlen < b->qlen)
- qlen = b->qlen;
- totalpm += b->p_mark;
- if (prob < b->p_mark)
- prob = b->p_mark;
+ u32 b_qlen = READ_ONCE(b->qlen);
+ u32 b_mark = READ_ONCE(b->p_mark);
+
+ if (qlen < b_qlen)
+ qlen = b_qlen;
+ totalpm += b_mark;
+ if (prob < b_mark)
+ prob = b_mark;
b++;
}
*prob_r = prob;
@@ -295,7 +298,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(sch->q.qlen >= q->limit)) {
qdisc_qstats_overlimit(sch);
- q->stats.queuedrop++;
+ WRITE_ONCE(q->stats.queuedrop,
+ q->stats.queuedrop + 1);
goto drop;
}
@@ -348,7 +352,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(minqlen >= q->max)) {
qdisc_qstats_overlimit(sch);
- q->stats.bucketdrop++;
+ WRITE_ONCE(q->stats.bucketdrop,
+ q->stats.bucketdrop + 1);
goto drop;
}
@@ -374,7 +379,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
if (sfb_rate_limit(skb, q)) {
qdisc_qstats_overlimit(sch);
- q->stats.penaltydrop++;
+ WRITE_ONCE(q->stats.penaltydrop,
+ q->stats.penaltydrop + 1);
goto drop;
}
goto enqueue;
@@ -390,14 +396,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* In either case, we want to start dropping packets.
*/
if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
- q->stats.earlydrop++;
+ WRITE_ONCE(q->stats.earlydrop,
+ q->stats.earlydrop + 1);
goto drop;
}
}
if (INET_ECN_set_ce(skb)) {
- q->stats.marked++;
+ WRITE_ONCE(q->stats.marked,
+ q->stats.marked + 1);
} else {
- q->stats.earlydrop++;
+ WRITE_ONCE(q->stats.earlydrop,
+ q->stats.earlydrop + 1);
goto drop;
}
}
@@ -410,7 +419,8 @@ enqueue:
sch->q.qlen++;
increment_qlen(&cb, q);
} else if (net_xmit_drop_count(ret)) {
- q->stats.childdrop++;
+ WRITE_ONCE(q->stats.childdrop,
+ q->stats.childdrop + 1);
qdisc_qstats_drop(sch);
}
return ret;
@@ -599,12 +609,12 @@ static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct tc_sfb_xstats st = {
- .earlydrop = q->stats.earlydrop,
- .penaltydrop = q->stats.penaltydrop,
- .bucketdrop = q->stats.bucketdrop,
- .queuedrop = q->stats.queuedrop,
- .childdrop = q->stats.childdrop,
- .marked = q->stats.marked,
+ .earlydrop = READ_ONCE(q->stats.earlydrop),
+ .penaltydrop = READ_ONCE(q->stats.penaltydrop),
+ .bucketdrop = READ_ONCE(q->stats.bucketdrop),
+ .queuedrop = READ_ONCE(q->stats.queuedrop),
+ .childdrop = READ_ONCE(q->stats.childdrop),
+ .marked = READ_ONCE(q->stats.marked),
};
st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 8e3752811950..a47a09d76400 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -972,11 +972,12 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
}
if (should_change_schedules(admin, oper, end_time)) {
- /* Set things so the next time this runs, the new
- * schedule runs.
- */
- end_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
+ /* After changing schedules, the next entry is the first one
+ * in the new schedule, with a pre-calculated end_time.
+ */
+ next = list_first_entry(&oper->entries, struct sched_entry, list);
+ end_time = next->end_time;
}
next->end_time = end_time;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d2665bbd41a2..58d0d9747f0b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4855,8 +4855,9 @@ static struct sock *sctp_clone_sock(struct sock *sk,
if (!newsk)
return ERR_PTR(err);
- /* sk_clone() sets refcnt to 2 */
+ /* sk_clone() sets refcnt to 2 and increments sockets_allocated */
sock_put(newsk);
+ sk_sockets_allocated_dec(newsk);
newinet = inet_sk(newsk);
newsp = sctp_sk(newsk);
@@ -7033,7 +7034,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
/* See if the user provided enough room for all the data */
num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr);
- if (len < num_chunks)
+ if (len < sizeof(struct sctp_authchunks) + num_chunks)
return -EINVAL;
if (copy_to_user(to, ch->chunks, num_chunks))
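
The getsockopt fix tightens a length bound: the reported optlen has to cover the fixed sctp_authchunks header as well as the variable chunk list, so validating len against num_chunks alone accepts buffers that are too small once the header is accounted for. A sketch of the corrected check, with a hypothetical header layout:

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

/* hypothetical fixed header preceding the variable chunk list */
struct authchunks {
	uint32_t assoc_id;
	uint32_t number_of_chunks;
};

static int check_optlen(size_t len, size_t num_chunks)
{
	/* len must cover the header and the chunks; comparing against
	 * num_chunks alone accepts buffers too small for both */
	if (len < sizeof(struct authchunks) + num_chunks)
		return -EINVAL;
	return 0;
}
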
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index c38fc7bf0a7e..014d527d5462 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -788,8 +788,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
dclc = (struct smc_clc_msg_decline *)clcm;
reason_code = SMC_CLC_DECL_PEERDECL;
smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
- if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 &
- SMC_FIRST_CONTACT_MASK) {
+ if ((dclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK) &&
+ smc->conn.lgr) {
smc->conn.lgr->sync_err = 1;
smc_lgr_terminate_sched(smc->conn.lgr);
}
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 76284fc538eb..b0bba0feef56 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -177,8 +177,20 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
if (fragid == LAST_FRAGMENT) {
TIPC_SKB_CB(head)->validated = 0;
- if (unlikely(!tipc_msg_validate(&head)))
+
+			/* If the reassembled skb has been freed in
+			 * tipc_msg_validate() because of an invalid truesize,
+			 * then head points to a newly allocated reassembled
+			 * skb, while *headbuf still points to the freed one.
+			 * In that case, update *headbuf so that the newly
+			 * allocated reassembled skb is freed later.
+			 */
+ if (unlikely(!tipc_msg_validate(&head))) {
+ if (head != *headbuf)
+ *headbuf = head;
goto err;
+ }
+
*buf = head;
TIPC_SKB_CB(head)->tail = NULL;
*headbuf = NULL;
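
The tipc fix handles a callee that may free and replace the buffer it was lent: the caller holds a second alias (*headbuf) to the same memory, and the shared error path frees through that alias, so the alias must be re-pointed before the jump or a stale pointer gets freed. A sketch under those assumptions, where validate() and append_last_fragment() are hypothetical stand-ins:

#include <stdbool.h>
#include <stdlib.h>

struct buf { int valid; };

/* stand-in for tipc_msg_validate(): on failure it may free *bp and
 * install a freshly allocated replacement */
static bool validate(struct buf **bp)
{
	if (!(*bp)->valid) {
		free(*bp);
		*bp = calloc(1, sizeof(**bp));
		return false;
	}
	return true;
}

static int append_last_fragment(struct buf **headbuf)
{
	struct buf *head = *headbuf;

	if (!validate(&head)) {
		/* the callee may have freed the old buffer and handed
		 * back a new one: re-point the alias that the shared
		 * error path frees, or it frees stale memory */
		if (head != *headbuf)
			*headbuf = head;
		return -1;
	}
	*headbuf = NULL;	/* success: ownership handed off */
	return 0;
}
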
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f668ff107722..e2d787ca3e74 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1968,16 +1968,19 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
static void unix_destruct_scm(struct sk_buff *skb)
{
- struct scm_cookie scm;
+ struct scm_cookie scm = {};
+
+ swap(scm.pid, UNIXCB(skb).pid);
- memset(&scm, 0, sizeof(scm));
- scm.pid = UNIXCB(skb).pid;
if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);
- /* Alas, it calls VFS */
- /* So fscking what? fput() had been SMP-safe since the last Summer */
scm_destroy(&scm);
+}
+
+static void unix_wfree(struct sk_buff *skb)
+{
+ unix_destruct_scm(skb);
sock_wfree(skb);
}
@@ -1993,7 +1996,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
if (scm->fp && send_fds)
err = unix_attach_fds(scm, skb);
- skb->destructor = unix_destruct_scm;
+ skb->destructor = unix_wfree;
return err;
}
@@ -2070,6 +2073,13 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
}
}
+static void unix_orphan_scm(struct sock *sk, struct sk_buff *skb)
+{
+ scm_stat_del(sk, skb);
+ unix_destruct_scm(skb);
+ skb->destructor = sock_wfree;
+}
+
/*
* Send AF_UNIX data.
*/
@@ -2683,10 +2693,16 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
int err;
mutex_lock(&u->iolock);
+
skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
- mutex_unlock(&u->iolock);
- if (!skb)
+ if (!skb) {
+ mutex_unlock(&u->iolock);
return err;
+ }
+
+ unix_orphan_scm(sk, skb);
+
+ mutex_unlock(&u->iolock);
return recv_actor(sk, skb);
}
@@ -2886,6 +2902,9 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
#endif
spin_unlock(&queue->lock);
+
+ unix_orphan_scm(sk, skb);
+
mutex_unlock(&u->iolock);
return recv_actor(sk, skb);
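
The af_unix refactor splits the old destructor into two halves, SCM teardown (pids, fds) and socket write-memory accounting, so unix_orphan_scm() can strip the SCM state early under the queue lock and leave plain sock_wfree() behind as the destructor. A compile-able sketch of that shape, with hypothetical skbuf stand-ins rather than the real sk_buff machinery:

struct skbuf;
typedef void (*destructor_fn)(struct skbuf *);

struct skbuf {
	destructor_fn destructor;
	int has_scm;		/* stand-in for attached pid/fd state */
};

static void scm_teardown(struct skbuf *skb)
{
	skb->has_scm = 0;	/* drop the pid and fd references */
}

static void wmem_free(struct skbuf *skb)
{
	(void)skb;		/* uncharge socket write memory */
}

/* the default destructor still runs both halves */
static void full_destruct(struct skbuf *skb)
{
	scm_teardown(skb);
	wmem_free(skb);
}

/* called with the queue locked: strip SCM state now, leave only the
 * accounting half to run when the skb is finally freed */
static void orphan_scm(struct skbuf *skb)
{
	scm_teardown(skb);
	skb->destructor = wmem_free;
}
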
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 2b7c0b5896ed..f862988c1e86 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -694,7 +694,6 @@ out:
static s64 hvs_stream_has_data(struct vsock_sock *vsk)
{
struct hvsock *hvs = vsk->trans;
- bool need_refill;
s64 ret;
if (hvs->recv_data_len > 0)
@@ -702,9 +701,31 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk)
switch (hvs_channel_readable_payload(hvs->chan)) {
case 1:
- need_refill = !hvs->recv_desc;
- if (!need_refill)
- return -EIO;
+ if (hvs->recv_desc) {
+ /* Here hvs->recv_data_len is 0, so hvs->recv_desc must
+ * be NULL unless it points to the 0-byte-payload FIN
+ * packet or a malformed/short packet: see
+ * hvs_update_recv_data().
+ *
+ * If hvs->recv_desc points to the FIN packet, here all
+ * the payload has been dequeued and the peer_shutdown
+ * flag is set, but hvs_channel_readable_payload() still
+ * returns 1, because the VMBus ringbuffer's read_index
+ * is not updated for the FIN packet:
+ * hvs_stream_dequeue() -> hv_pkt_iter_next() updates
+ * the cached priv_read_index but has no opportunity to
+ * update the read_index in hv_pkt_iter_close() as
+ * hvs_stream_has_data() returns 0 for the FIN packet,
+ * so it won't get dequeued.
+ *
+			 * If hvs->recv_desc instead points to a
+			 * malformed/short packet, return -EIO.
+			 */
+ if (!(vsk->peer_shutdown & SEND_SHUTDOWN))
+ return -EIO;
+
+ return 0;
+ }
hvs->recv_desc = hv_pkt_iter_first(hvs->chan);
if (!hvs->recv_desc)
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index a152a9e208d0..416d533f493d 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -73,6 +73,7 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
struct sk_buff *skb,
struct msghdr *msg,
+ size_t pkt_len,
bool zerocopy)
{
struct ubuf_info *uarg;
@@ -81,12 +82,10 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
uarg = msg->msg_ubuf;
net_zcopy_get(uarg);
} else {
- struct iov_iter *iter = &msg->msg_iter;
struct ubuf_info_msgzc *uarg_zc;
uarg = msg_zerocopy_realloc(sk_vsock(vsk),
- iter->count,
- NULL, false);
+ pkt_len, NULL, false);
if (!uarg)
return -1;
@@ -398,11 +397,17 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
* each iteration. If this is last skb for this buffer
* and MSG_ZEROCOPY mode is in use - we must allocate
* completion for the current syscall.
+ *
+		 * Pass pkt_len because the msg iterator has already
+		 * been consumed by virtio_transport_fill_skb(), so
+		 * iter->count cannot be used for the RLIMIT_MEMLOCK
+		 * pinned-pages accounting done by msg_zerocopy_realloc().
*/
if (info->msg && info->msg->msg_flags & MSG_ZEROCOPY &&
skb_len == rest_len && info->op == VIRTIO_VSOCK_OP_RW) {
if (virtio_transport_init_zcopy_skb(vsk, skb,
info->msg,
+ pkt_len,
can_zcopy)) {
kfree_skb(skb);
ret = -ENOMEM;
@@ -545,9 +550,8 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
skb_queue_walk(&vvs->rx_queue, skb) {
size_t bytes;
- bytes = len - total;
- if (bytes > skb->len)
- bytes = skb->len;
+ bytes = min_t(size_t, len - total,
+ skb->len - VIRTIO_VSOCK_SKB_CB(skb)->offset);
spin_unlock_bh(&vvs->rx_lock);
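
The MSG_PEEK fix accounts for partially consumed skbs: a peek may start at a buffer whose first offset bytes were already read, so the copyable remainder is skb->len - offset, not skb->len. A sketch of the corrected length computation, with hypothetical parameter names:

#include <stddef.h>

/* a partially consumed skb keeps its read position in an offset, so
 * the bytes still available to a peek are len - offset, not len */
static size_t peek_bytes(size_t want, size_t copied,
			 size_t skb_len, size_t skb_off)
{
	size_t avail = skb_len - skb_off;	/* unread payload left */
	size_t need = want - copied;		/* still wanted by caller */

	return need < avail ? need : avail;	/* the min_t() in the fix */
}
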
@@ -1558,8 +1562,6 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
return -ENOMEM;
}
- sk_acceptq_added(sk);
-
lock_sock_nested(child, SINGLE_DEPTH_NESTING);
child->sk_state = TCP_ESTABLISHED;
@@ -1581,6 +1583,7 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
return ret;
}
+ sk_acceptq_added(sk);
if (virtio_transport_space_update(child, skb))
child->sk_write_space(child);