summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-05-07 10:32:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-05-07 10:32:03 -0700
commitfcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0 (patch)
treefc6254372916832b89cb60f94464d41a48b2f045 /net
parent19cbc75c56c0ed4fa3f637e3c41a98895a68dfae (diff)
parent41ae14071cd7f6a7770e2fe1f8a0859d4c2c6ba4 (diff)
Merge tag 'net-7.1-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Jakub Kicinski: "Including fixes from Netfilter, IPsec, Bluetooth and WiFi. Current release - fix to a fix: - ipmr: add __rcu to netns_ipv4.mrt, make sure we hold the RCU lock in all relevant places Current release - new code bugs: - fixes for the recently added resizable hash tables - ipv6: make sure we default IPv6 tunnel drivers to =m now that IPv6 itself is built in - drv: octeontx2-af: fixes for parser/CAM fixes Previous releases - regressions: - phy: micrel: fix LAN8814 QSGMII soft reset - wifi: - cw1200: revert "Fix locking in error paths" - ath12k: fix crash on WCN7850, due to adding the same queue buffer to a list multiple times Previous releases - always broken: - number of info leak fixes - ipv6: implement limits on extension header parsing - wifi: number of fixes for missing bound checks in the drivers - Bluetooth: fixes for races and locking issues - af_unix: - fix an issue between garbage collection and PEEK - fix yet another issue with OOB data - xfrm: esp: avoid in-place decrypt on shared skb frags - netfilter: replace skb_try_make_writable() by skb_ensure_writable() - openvswitch: vport: fix race between tunnel creation and linking leading to invalid memory accesses (type confusion) - drv: amd-xgbe: fix PTP addend overflow causing frozen clock Misc: - sched/isolation: make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN (for relevant IPVS change)" * tag 'net-7.1-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (190 commits) net: sparx5: configure serdes for 1000BASE-X in sparx5_port_init() net: sparx5: fix wrong chip ids for TSN SKUs net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in nvt_set_phy_intf_sel() tcp: Fix dst leak in tcp_v6_connect(). ipmr: Call ipmr_fib_lookup() under RCU. 
net: phy: broadcom: Save PHY counters during suspend net/smc: fix missing sk_err when TCP handshake fails af_unix: Reject SIOCATMARK on non-stream sockets veth: fix OOB txq access in veth_poll() with asymmetric queue counts eth: fbnic: fix double-free of PCS on phylink creation failure net: ethernet: cortina: Drop half-assembled SKB selftests: mptcp: pm: restrict 'unknown' check to pm_nl_ctl selftests: mptcp: check output: catch cmd errors mptcp: pm: prio: skip closed subflows mptcp: pm: ADD_ADDR rtx: return early if no retrans mptcp: pm: ADD_ADDR rtx: skip inactive subflows mptcp: pm: ADD_ADDR rtx: resched blocked ADD_ADDR quicker mptcp: pm: ADD_ADDR rtx: free sk if last mptcp: pm: ADD_ADDR rtx: always decrease sk refcount mptcp: pm: ADD_ADDR rtx: fix potential data-race ...
Diffstat (limited to 'net')
-rw-r--r--net/bluetooth/bnep/core.c13
-rw-r--r--net/bluetooth/hci_conn.c124
-rw-r--r--net/bluetooth/hci_event.c29
-rw-r--r--net/bluetooth/hidp/core.c27
-rw-r--r--net/bluetooth/iso.c56
-rw-r--r--net/bluetooth/l2cap_core.c14
-rw-r--r--net/bluetooth/l2cap_sock.c9
-rw-r--r--net/bluetooth/rfcomm/core.c7
-rw-r--r--net/bluetooth/sco.c60
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/netpoll.c23
-rw-r--r--net/core/rtnetlink.c1
-rw-r--r--net/ipv4/ah4.c14
-rw-r--r--net/ipv4/esp4.c3
-rw-r--r--net/ipv4/igmp.c58
-rw-r--r--net/ipv4/inetpeer.c3
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ipmr.c10
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c3
-rw-r--r--net/ipv4/tcp_ipv4.c14
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv6/Kconfig4
-rw-r--r--net/ipv6/ah6.c14
-rw-r--r--net/ipv6/esp6.c3
-rw-r--r--net/ipv6/exthdrs_core.c7
-rw-r--r--net/ipv6/ip6_gre.c5
-rw-r--r--net/ipv6/ip6_input.c5
-rw-r--r--net/ipv6/ip6_output.c5
-rw-r--r--net/ipv6/ip6_tunnel.c4
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c5
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/tcp_ipv6.c17
-rw-r--r--net/ipv6/xfrm6_protocol.c4
-rw-r--r--net/mac80211/mlme.c18
-rw-r--r--net/mac80211/rx.c6
-rw-r--r--net/mac80211/tests/chan-mode.c1
-rw-r--r--net/mac80211/util.c4
-rw-r--r--net/mctp/test/route-test.c2
-rw-r--r--net/mctp/test/utils.c2
-rw-r--r--net/mptcp/fastopen.c4
-rw-r--r--net/mptcp/pm.c62
-rw-r--r--net/mptcp/pm_kernel.c13
-rw-r--r--net/mptcp/sockopt.c4
-rw-r--r--net/mptcp/subflow.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c76
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c164
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c83
-rw-r--r--net/netfilter/nf_dup_netdev.c16
-rw-r--r--net/netfilter/nf_flow_table_core.c1
-rw-r--r--net/netfilter/nf_flow_table_ip.c151
-rw-r--r--net/netfilter/nf_flow_table_path.c7
-rw-r--r--net/netfilter/nf_tables_api.c35
-rw-r--r--net/netfilter/nf_tables_core.c2
-rw-r--r--net/netfilter/nft_compat.c45
-rw-r--r--net/netfilter/nft_exthdr.c2
-rw-r--r--net/netfilter/nft_fwd_netdev.c29
-rw-r--r--net/netfilter/nft_osf.c2
-rw-r--r--net/netfilter/nft_tproxy.c8
-rw-r--r--net/netfilter/x_tables.c79
-rw-r--r--net/netfilter/xt_CT.c8
-rw-r--r--net/netfilter/xt_TCPMSS.c33
-rw-r--r--net/netfilter/xt_TPROXY.c11
-rw-r--r--net/netfilter/xt_addrtype.c25
-rw-r--r--net/netfilter/xt_devgroup.c18
-rw-r--r--net/netfilter/xt_ecn.c4
-rw-r--r--net/netfilter/xt_hashlimit.c4
-rw-r--r--net/netfilter/xt_osf.c3
-rw-r--r--net/netfilter/xt_physdev.c20
-rw-r--r--net/netfilter/xt_policy.c24
-rw-r--r--net/netfilter/xt_set.c39
-rw-r--r--net/netfilter/xt_tcpmss.c4
-rw-r--r--net/openvswitch/vport-geneve.c5
-rw-r--r--net/openvswitch/vport-gre.c5
-rw-r--r--net/openvswitch/vport-netdev.c64
-rw-r--r--net/openvswitch/vport-netdev.h2
-rw-r--r--net/openvswitch/vport-vxlan.c5
-rw-r--r--net/psp/psp_main.c42
-rw-r--r--net/rds/message.c20
-rw-r--r--net/sched/sch_cake.c155
-rw-r--r--net/sched/sch_fq_codel.c39
-rw-r--r--net/sched/sch_pie.c14
-rw-r--r--net/sched/sch_red.c2
-rw-r--r--net/sched/sch_sfb.c2
-rw-r--r--net/sched/sch_sfq.c48
-rw-r--r--net/smc/af_smc.c8
-rw-r--r--net/tls/tls_sw.c6
-rw-r--r--net/unix/af_unix.c3
-rw-r--r--net/unix/garbage.c6
-rw-r--r--net/vmw_vsock/virtio_transport_common.c4
-rw-r--r--net/wireless/nl80211.c27
-rw-r--r--net/wireless/pmsr.c2
-rw-r--r--net/xdp/xsk.c115
-rw-r--r--net/xdp/xsk_buff_pool.c3
-rw-r--r--net/xfrm/xfrm_output.c20
-rw-r--r--net/xfrm/xfrm_state.c12
-rw-r--r--net/xfrm/xfrm_user.c1
97 files changed, 1481 insertions, 696 deletions
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index d44987d4515c..853c8d7644b5 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -330,11 +330,18 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
goto badframe;
break;
case BNEP_FILTER_MULTI_ADDR_SET:
- case BNEP_FILTER_NET_TYPE_SET:
- /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */
- if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2))
+ case BNEP_FILTER_NET_TYPE_SET: {
+ u8 *hdr;
+
+ /* Pull ctrl type (1 b) + len (2 b) */
+ hdr = skb_pull_data(skb, 3);
+ if (!hdr)
+ goto badframe;
+ /* Pull data (len bytes); length is big-endian */
+ if (!skb_pull(skb, get_unaligned_be16(&hdr[1])))
goto badframe;
break;
+ }
default:
kfree_skb(skb);
return 0;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 3a0592599086..17b46ad6a349 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -480,40 +480,107 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
return hci_setup_sync_conn(conn, handle);
}
-u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
- u16 to_multiplier)
+struct le_conn_update_data {
+ struct hci_conn *conn;
+ u16 min;
+ u16 max;
+ u16 latency;
+ u16 to_multiplier;
+};
+
+static int le_conn_update_sync(struct hci_dev *hdev, void *data)
{
- struct hci_dev *hdev = conn->hdev;
+ struct le_conn_update_data *d = data;
+ struct hci_conn *conn = d->conn;
struct hci_conn_params *params;
struct hci_cp_le_conn_update cp;
+ u16 timeout;
+ u8 store_hint;
+ int err;
+ /* Verify connection is still alive and read conn fields under
+ * the same lock to prevent a concurrent disconnect from freeing
+ * or reusing the connection while we build the HCI command.
+ */
hci_dev_lock(hdev);
- params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
- if (params) {
- params->conn_min_interval = min;
- params->conn_max_interval = max;
- params->conn_latency = latency;
- params->supervision_timeout = to_multiplier;
+ if (!hci_conn_valid(hdev, conn)) {
+ hci_dev_unlock(hdev);
+ return -ECANCELED;
}
- hci_dev_unlock(hdev);
-
memset(&cp, 0, sizeof(cp));
cp.handle = cpu_to_le16(conn->handle);
- cp.conn_interval_min = cpu_to_le16(min);
- cp.conn_interval_max = cpu_to_le16(max);
- cp.conn_latency = cpu_to_le16(latency);
- cp.supervision_timeout = cpu_to_le16(to_multiplier);
+ cp.conn_interval_min = cpu_to_le16(d->min);
+ cp.conn_interval_max = cpu_to_le16(d->max);
+ cp.conn_latency = cpu_to_le16(d->latency);
+ cp.supervision_timeout = cpu_to_le16(d->to_multiplier);
cp.min_ce_len = cpu_to_le16(0x0000);
cp.max_ce_len = cpu_to_le16(0x0000);
+ timeout = conn->conn_timeout;
+
+ hci_dev_unlock(hdev);
- hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp);
+ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CONN_UPDATE,
+ sizeof(cp), &cp,
+ HCI_EV_LE_CONN_UPDATE_COMPLETE,
+ timeout, NULL);
+ if (err)
+ return err;
+
+ /* Update stored connection parameters after the controller has
+ * confirmed the update via the LE Connection Update Complete event.
+ */
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ params->conn_min_interval = d->min;
+ params->conn_max_interval = d->max;
+ params->conn_latency = d->latency;
+ params->supervision_timeout = d->to_multiplier;
+ store_hint = 0x01;
+ } else {
+ store_hint = 0x00;
+ }
- if (params)
- return 0x01;
+ hci_dev_unlock(hdev);
- return 0x00;
+ mgmt_new_conn_param(hdev, &conn->dst, conn->dst_type, store_hint,
+ d->min, d->max, d->latency, d->to_multiplier);
+
+ return 0;
+}
+
+static void le_conn_update_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct le_conn_update_data *d = data;
+
+ hci_conn_put(d->conn);
+ kfree(d);
+}
+
+void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
+ u16 to_multiplier)
+{
+ struct le_conn_update_data *d;
+
+ d = kzalloc_obj(*d);
+ if (!d)
+ return;
+
+ hci_conn_get(conn);
+ d->conn = conn;
+ d->min = min;
+ d->max = max;
+ d->latency = latency;
+ d->to_multiplier = to_multiplier;
+
+ if (hci_cmd_sync_queue(conn->hdev, le_conn_update_sync, d,
+ le_conn_update_complete) < 0) {
+ hci_conn_put(conn);
+ kfree(d);
+ }
}
void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
@@ -2130,6 +2197,9 @@ static int create_big_sync(struct hci_dev *hdev, void *data)
u32 flags = 0;
int err;
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
if (qos->bcast.out.phys == BIT(1))
flags |= MGMT_ADV_FLAG_SEC_2M;
@@ -2204,11 +2274,24 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err)
bt_dev_dbg(hdev, "conn %p", conn);
+ if (err == -ECANCELED)
+ goto done;
+
+ hci_dev_lock(hdev);
+
+ if (!hci_conn_valid(hdev, conn))
+ goto unlock;
+
if (err) {
bt_dev_err(hdev, "Unable to create BIG: %d", err);
hci_connect_cfm(conn, err);
hci_conn_del(conn);
}
+
+unlock:
+ hci_dev_unlock(hdev);
+done:
+ hci_conn_put(conn);
}
struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
@@ -2336,10 +2419,11 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
BT_BOUND, &data);
/* Queue start periodic advertising and create BIG */
- err = hci_cmd_sync_queue(hdev, create_big_sync, conn,
+ err = hci_cmd_sync_queue(hdev, create_big_sync, hci_conn_get(conn),
create_big_complete);
if (err < 0) {
hci_conn_drop(conn);
+ hci_conn_put(conn);
return ERR_PTR(err);
}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index b2ee6b6a0f56..eea2f810aafa 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -7118,9 +7118,29 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
continue;
}
+ if (ev->num_bis <= i) {
+ bt_dev_err(hdev,
+ "Not enough BIS handles for BIG 0x%2.2x",
+ ev->handle);
+ ev->status = HCI_ERROR_UNSPECIFIED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+ continue;
+ }
+
if (hci_conn_set_handle(conn,
- __le16_to_cpu(ev->bis_handle[i++])))
+ __le16_to_cpu(ev->bis_handle[i++]))) {
+ bt_dev_err(hdev,
+ "Failed to set BIS handle for BIG 0x%2.2x",
+ ev->handle);
+ /* Force error so BIG gets terminated as not all BIS
+ * could be connected.
+ */
+ ev->status = HCI_ERROR_UNSPECIFIED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
continue;
+ }
conn->state = BT_CONNECTED;
set_bit(HCI_CONN_BIG_CREATED, &conn->flags);
@@ -7129,7 +7149,10 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
hci_iso_setup_path(conn);
}
- if (!ev->status && !i)
+ /* If there is an unexpected error or if no BISes have been connected
+ * for the BIG, terminate it.
+ */
+ if (ev->status == HCI_ERROR_UNSPECIFIED || (!ev->status && !i))
/* If no BISes have been connected for the BIG,
* terminate. This is in case all bound connections
* have been closed before the BIG creation
@@ -7168,7 +7191,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
conn->num_bis = 0;
- memset(conn->bis, 0, sizeof(conn->num_bis));
+ memset(conn->bis, 0, sizeof(conn->bis));
for (i = 0; i < ev->num_bis; i++) {
u16 handle = le16_to_cpu(ev->bis[i]);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 7bcf8c5ceaee..976f91eeb745 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1036,6 +1036,28 @@ static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr)
}
/*
+ * Consume session->conn: clear the member under hidp_session_sem, then
+ * l2cap_unregister_user() and l2cap_conn_put() the snapshot outside the
+ * sem. At most one caller wins; later callers see NULL and skip. The
+ * reference is the one hidp_session_new() took via l2cap_conn_get().
+ */
+static void hidp_session_unregister_conn(struct hidp_session *session)
+{
+ struct l2cap_conn *conn;
+
+ down_write(&hidp_session_sem);
+ conn = session->conn;
+ if (conn)
+ session->conn = NULL;
+ up_write(&hidp_session_sem);
+
+ if (conn) {
+ l2cap_unregister_user(conn, &session->user);
+ l2cap_conn_put(conn);
+ }
+}
+
+/*
* Start session synchronously
* This starts a session thread and waits until initialization
* is done or returns an error if it couldn't be started.
@@ -1311,8 +1333,7 @@ static int hidp_session_thread(void *arg)
* Instead, this call has the same semantics as if user-space tried to
* delete the session.
*/
- if (session->conn)
- l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_unregister_conn(session);
hidp_session_put(session);
@@ -1418,7 +1439,7 @@ int hidp_connection_del(struct hidp_conndel_req *req)
HIDP_CTRL_VIRTUAL_CABLE_UNPLUG,
NULL, 0);
else
- l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_unregister_conn(session);
hidp_session_put(session);
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index be145e2736b7..7cb2864fe872 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -347,6 +347,7 @@ static int iso_connect_bis(struct sock *sk)
return -EHOSTUNREACH;
hci_dev_lock(hdev);
+ lock_sock(sk);
if (!bis_capable(hdev)) {
err = -EOPNOTSUPP;
@@ -399,13 +400,9 @@ static int iso_connect_bis(struct sock *sk)
goto unlock;
}
- lock_sock(sk);
-
err = iso_chan_add(conn, sk, NULL);
- if (err) {
- release_sock(sk);
+ if (err)
goto unlock;
- }
/* Update source addr of the socket */
bacpy(&iso_pi(sk)->src, &hcon->src);
@@ -421,9 +418,8 @@ static int iso_connect_bis(struct sock *sk)
iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
}
- release_sock(sk);
-
unlock:
+ release_sock(sk);
hci_dev_unlock(hdev);
hci_dev_put(hdev);
return err;
@@ -444,6 +440,7 @@ static int iso_connect_cis(struct sock *sk)
return -EHOSTUNREACH;
hci_dev_lock(hdev);
+ lock_sock(sk);
if (!cis_central_capable(hdev)) {
err = -EOPNOTSUPP;
@@ -498,13 +495,9 @@ static int iso_connect_cis(struct sock *sk)
goto unlock;
}
- lock_sock(sk);
-
err = iso_chan_add(conn, sk, NULL);
- if (err) {
- release_sock(sk);
+ if (err)
goto unlock;
- }
/* Update source addr of the socket */
bacpy(&iso_pi(sk)->src, &hcon->src);
@@ -520,9 +513,8 @@ static int iso_connect_cis(struct sock *sk)
iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
}
- release_sock(sk);
-
unlock:
+ release_sock(sk);
hci_dev_unlock(hdev);
hci_dev_put(hdev);
return err;
@@ -1193,7 +1185,7 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
release_sock(sk);
- if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY))
+ if (bacmp(&sa->iso_bdaddr, BDADDR_ANY))
err = iso_connect_cis(sk);
else
err = iso_connect_bis(sk);
@@ -2256,8 +2248,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
iso_match_sid, ev1);
if (sk && !ev1->status) {
+ lock_sock(sk);
iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
iso_pi(sk)->bc_sid = ev1->sid;
+ release_sock(sk);
}
goto done;
@@ -2268,8 +2262,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
iso_match_sid_past, ev1a);
if (sk && !ev1a->status) {
+ lock_sock(sk);
iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle);
iso_pi(sk)->bc_sid = ev1a->sid;
+ release_sock(sk);
}
goto done;
@@ -2296,27 +2292,35 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
ev2);
if (sk) {
- int err;
- struct hci_conn *hcon = iso_pi(sk)->conn->hcon;
+ int err = 0;
+ bool big_sync;
+ struct hci_conn *hcon;
+ lock_sock(sk);
+
+ hcon = iso_pi(sk)->conn->hcon;
iso_pi(sk)->qos.bcast.encryption = ev2->encryption;
if (ev2->num_bis < iso_pi(sk)->bc_num_bis)
iso_pi(sk)->bc_num_bis = ev2->num_bis;
- if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
- !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) {
+ big_sync = !test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
+ !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags);
+
+ if (big_sync)
err = hci_conn_big_create_sync(hdev, hcon,
&iso_pi(sk)->qos,
iso_pi(sk)->sync_handle,
iso_pi(sk)->bc_num_bis,
iso_pi(sk)->bc_bis);
- if (err) {
- bt_dev_err(hdev, "hci_le_big_create_sync: %d",
- err);
- sock_put(sk);
- sk = NULL;
- }
+
+ release_sock(sk);
+
+ if (big_sync && err) {
+ bt_dev_err(hdev, "hci_le_big_create_sync: %d",
+ err);
+ sock_put(sk);
+ sk = NULL;
}
}
@@ -2370,8 +2374,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
if (!base || base_len > BASE_MAX_LENGTH)
goto done;
+ lock_sock(sk);
memcpy(iso_pi(sk)->base, base, base_len);
iso_pi(sk)->base_len = base_len;
+ release_sock(sk);
} else {
/* This is a PA data fragment. Keep pa_data_len set to 0
* until all data has been reassembled.
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 77dec104a9c3..7701528f1167 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -4706,16 +4706,8 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP,
sizeof(rsp), &rsp);
- if (!err) {
- u8 store_hint;
-
- store_hint = hci_le_conn_update(hcon, min, max, latency,
- to_multiplier);
- mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type,
- store_hint, min, max, latency,
- to_multiplier);
-
- }
+ if (!err)
+ hci_le_conn_update(hcon, min, max, latency, to_multiplier);
return 0;
}
@@ -5428,7 +5420,7 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn,
* configured, the MPS field may be less than the current MPS
* of that channel.
*/
- if (chan[i]->remote_mps >= mps && i) {
+ if (chan[i]->remote_mps > mps && num_scid > 1) {
BT_ERR("chan %p decreased MPS %u -> %u", chan[i],
chan[i]->remote_mps, mps);
result = L2CAP_RECONF_INVALID_MPS;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 71e8c1b45bce..cf590a67d364 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1498,6 +1498,9 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
{
struct sock *sk, *parent = chan->data;
+ if (!parent)
+ return NULL;
+
lock_sock(parent);
/* Check for backlog size */
@@ -1657,6 +1660,9 @@ static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state,
{
struct sock *sk = chan->data;
+ if (!sk)
+ return;
+
sk->sk_state = state;
if (err)
@@ -1758,6 +1764,9 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
+ if (!sk)
+ return 0;
+
return READ_ONCE(sk->sk_sndtimeo);
}
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 611a9a94151e..d11bd5337d57 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -1715,9 +1715,12 @@ static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk
}
if (pf && d->cfc) {
- u8 credits = *(u8 *) skb->data; skb_pull(skb, 1);
+ u8 *credits = skb_pull_data(skb, 1);
- d->tx_credits += credits;
+ if (!credits)
+ goto drop;
+
+ d->tx_credits += *credits;
if (d->tx_credits)
clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 18826d4b9c0b..eba44525d41d 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -472,9 +472,13 @@ static struct sock *sco_get_sock_listen(bdaddr_t *src)
sk1 = sk;
}
+ sk = sk ? sk : sk1;
+ if (sk)
+ sock_hold(sk);
+
read_unlock(&sco_sk_list.lock);
- return sk ? sk : sk1;
+ return sk;
}
static void sco_sock_destruct(struct sock *sk)
@@ -515,11 +519,13 @@ static void sco_sock_kill(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
/* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ lock_sock(sk);
if (sco_pi(sk)->conn) {
sco_conn_lock(sco_pi(sk)->conn);
sco_pi(sk)->conn->sk = NULL;
sco_conn_unlock(sco_pi(sk)->conn);
}
+ release_sock(sk);
/* Kill poor orphan */
bt_sock_unlink(&sco_sk_list, sk);
@@ -1365,40 +1371,51 @@ static int sco_sock_release(struct socket *sock)
static void sco_conn_ready(struct sco_conn *conn)
{
- struct sock *parent;
- struct sock *sk = conn->sk;
+ struct sock *parent, *sk;
+
+ sco_conn_lock(conn);
+ sk = sco_sock_hold(conn);
+ sco_conn_unlock(conn);
BT_DBG("conn %p", conn);
if (sk) {
lock_sock(sk);
- sco_sock_clear_timer(sk);
- sk->sk_state = BT_CONNECTED;
- sk->sk_state_change(sk);
+
+ /* conn->sk may have become NULL if racing with sk close, but
+ * due to held hdev->lock, it can't become different sk.
+ */
+ if (conn->sk) {
+ sco_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ sk->sk_state_change(sk);
+ }
+
release_sock(sk);
+ sock_put(sk);
} else {
- sco_conn_lock(conn);
-
- if (!conn->hcon) {
- sco_conn_unlock(conn);
+ if (!conn->hcon)
return;
- }
+
+ lockdep_assert_held(&conn->hcon->hdev->lock);
parent = sco_get_sock_listen(&conn->hcon->src);
- if (!parent) {
- sco_conn_unlock(conn);
+ if (!parent)
return;
- }
lock_sock(parent);
+ sco_conn_lock(conn);
+
+ /* hdev->lock guarantees conn->sk == NULL still here */
+
+ if (parent->sk_state != BT_LISTEN)
+ goto release;
+
sk = sco_sock_alloc(sock_net(parent), NULL,
BTPROTO_SCO, GFP_ATOMIC, 0);
- if (!sk) {
- release_sock(parent);
- sco_conn_unlock(conn);
- return;
- }
+ if (!sk)
+ goto release;
sco_sock_init(sk, parent);
@@ -1417,9 +1434,10 @@ static void sco_conn_ready(struct sco_conn *conn)
/* Wake up parent */
parent->sk_data_ready(parent);
- release_sock(parent);
-
+release:
sco_conn_unlock(conn);
+ release_sock(parent);
+ sock_put(parent);
}
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 06c195906231..8bfa8313ef62 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head)
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
netdev_name_node_del(name_node);
- list_del(&name_node->list);
+ list_del_rcu(&name_node->list);
call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4381e0fc25bf..84faace50ac2 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -608,14 +608,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
/*
* Returns a pointer to a string representation of the identifier used
* to select the egress interface for the given netpoll instance. buf
- * must be a buffer of length at least MAC_ADDR_STR_LEN + 1.
+ * is used to format np->dev_mac when np->dev_name is empty; bufsz must
+ * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address
+ * and its NUL terminator.
*/
-static char *egress_dev(struct netpoll *np, char *buf)
+static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz)
{
if (np->dev_name[0])
return np->dev_name;
- snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac);
+ snprintf(buf, bufsz, "%pM", np->dev_mac);
return buf;
}
@@ -645,7 +647,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
if (!IS_ENABLED(CONFIG_IPV6)) {
np_err(np, "IPv6 is not supported %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EINVAL;
}
@@ -667,7 +669,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
}
if (err) {
np_err(np, "no IPv6 address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return err;
}
@@ -687,14 +689,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
in_dev = __in_dev_get_rtnl(ndev);
if (!in_dev) {
np_err(np, "no IP address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EDESTADDRREQ;
}
ifa = rtnl_dereference(in_dev->ifa_list);
if (!ifa) {
np_err(np, "no IP address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EDESTADDRREQ;
}
@@ -736,7 +738,8 @@ int netpoll_setup(struct netpoll *np)
ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
if (!ndev) {
- np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
+ np_err(np, "%s doesn't exist, aborting\n",
+ egress_dev(np, buf, sizeof(buf)));
err = -ENODEV;
goto unlock;
}
@@ -744,14 +747,14 @@ int netpoll_setup(struct netpoll *np)
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
err = -EBUSY;
goto put;
}
if (!netif_running(ndev)) {
np_info(np, "device %s not up yet, forcing it\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
err = dev_open(ndev, NULL);
if (err) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b613bb6e07df..df042da422ef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
port_guid.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memset(&vf_broadcast, 0, sizeof(vf_broadcast));
memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5fb812443a08..4366cbac3f06 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -124,9 +124,14 @@ static void ah_output_done(void *data, int err)
struct iphdr *top_iph = ip_hdr(skb);
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int ihl = ip_hdrlen(skb);
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
iph = AH_SKB_CB(skb)->tmp;
- icv = ah_tmp_icv(iph, ihl);
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
@@ -270,12 +275,17 @@ static void ah_input_done(void *data, int err)
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int ihl = ip_hdrlen(skb);
int ah_hlen = (ah->hdrlen + 2) << 2;
+ int seqhi_len = 0;
+ __be32 *seqhi;
if (err)
goto out;
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
work_iph = AH_SKB_CB(skb)->tmp;
- auth_data = ah_tmp_auth(work_iph, ihl);
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 6dfc0bcdef65..6a5febbdbee4 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -873,7 +873,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = 1;
goto skip_cow;
- } else if (!skb_has_frag_list(skb)) {
+ } else if (!skb_has_frag_list(skb) &&
+ !skb_has_shared_frag(skb)) {
nfrags = skb_shinfo(skb)->nr_frags;
nfrags++;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a674fb44ec25..a9ad39064f3b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -122,16 +122,29 @@
* contradict to specs provided this delay is small enough.
*/
-#define IGMP_V1_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
- IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
- ((in_dev)->mr_v1_seen && \
- time_before(jiffies, (in_dev)->mr_v1_seen)))
-#define IGMP_V2_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
- IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
- ((in_dev)->mr_v2_seen && \
- time_before(jiffies, (in_dev)->mr_v2_seen)))
+static bool IGMP_V1_SEEN(const struct in_device *in_dev)
+{
+ unsigned long seen;
+
+ if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1)
+ return true;
+ if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1)
+ return true;
+ seen = READ_ONCE(in_dev->mr_v1_seen);
+ return seen && time_before(jiffies, seen);
+}
+
+static bool IGMP_V2_SEEN(const struct in_device *in_dev)
+{
+ unsigned long seen;
+
+ if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2)
+ return true;
+ if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2)
+ return true;
+ seen = READ_ONCE(in_dev->mr_v2_seen);
+ return seen && time_before(jiffies, seen);
+}
static int unsolicited_report_interval(struct in_device *in_dev)
{
@@ -954,23 +967,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
int max_delay;
int mark = 0;
struct net *net = dev_net(in_dev->dev);
-
+ unsigned long seen;
if (len == 8) {
+ seen = jiffies + READ_ONCE(in_dev->mr_qrv) * READ_ONCE(in_dev->mr_qi) +
+ READ_ONCE(in_dev->mr_qri);
if (ih->code == 0) {
/* Alas, old v1 router presents here. */
max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
- in_dev->mr_v1_seen = jiffies +
- (in_dev->mr_qrv * in_dev->mr_qi) +
- in_dev->mr_qri;
+ WRITE_ONCE(in_dev->mr_v1_seen, seen);
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
- in_dev->mr_v2_seen = jiffies +
- (in_dev->mr_qrv * in_dev->mr_qi) +
- in_dev->mr_qri;
+ WRITE_ONCE(in_dev->mr_v2_seen, seen);
}
/* cancel the interface change timer */
WRITE_ONCE(in_dev->mr_ifc_count, 0);
@@ -995,6 +1006,8 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
} else { /* v3 */
+ unsigned long mr_qi;
+
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
return true;
@@ -1015,15 +1028,16 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
* received value was zero, use the default or statically
* configured value.
*/
- in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
- in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
-
+ WRITE_ONCE(in_dev->mr_qrv,
+ ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv));
+ mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+ WRITE_ONCE(in_dev->mr_qi, mr_qi);
/* RFC3376, 8.3. Query Response Interval:
* The number of seconds represented by the [Query Response
* Interval] must be less than the [Query Interval].
*/
- if (in_dev->mr_qri >= in_dev->mr_qi)
- in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
+ if (READ_ONCE(in_dev->mr_qri) >= mr_qi)
+ WRITE_ONCE(in_dev->mr_qri, (mr_qi/HZ - 1) * HZ);
if (!group) { /* general query */
if (ih3->nsrcs)
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d8083b9033c2..5b957a831e7c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -179,7 +179,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
seq = read_seqbegin(&base->lock);
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
- if (p)
+ /* Make sure tree was not modified during our lookup. */
+ if (p && !read_seqretry(&base->lock, seq))
return p;
/* retry an exact lookup, taking the lock before.
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e4790cc7b5c2..5bcd73cbdb41 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1233,6 +1233,8 @@ alloc_new_skb:
if (err < 0)
goto error;
copy = err;
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
wmem_alloc_delta += copy;
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2058ca860294..2628cd3a93a6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -537,15 +537,16 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
};
int err;
+ rcu_read_lock();
err = ipmr_fib_lookup(net, &fl4, &mrt);
if (err < 0) {
+ rcu_read_unlock();
kfree_skb(skb);
return err;
}
DEV_STATS_ADD(dev, tx_bytes, skb->len);
DEV_STATS_INC(dev, tx_packets);
- rcu_read_lock();
/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
@@ -1112,11 +1113,12 @@ static int ipmr_cache_report(const struct mr_table *mrt,
msg->im_vif_hi = vifi >> 8;
ipv4_pktinfo_prepare(mroute_sk, pkt, false);
memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
- /* Add our header */
- igmp = skb_put(skb, sizeof(struct igmphdr));
+ /* Add our header.
+ * Note that code, csum and group fields are cleared.
+ */
+ igmp = skb_put_zero(skb, sizeof(struct igmphdr));
igmp->type = assert;
msg->im_msgtype = assert;
- igmp->code = 0;
ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
skb->transport_header = skb->network_header;
}
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 5080fa5fbf6a..f9c6755f5ec5 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -94,6 +94,9 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
#endif
int doff = 0;
+ if (ntohs(iph->frag_off) & IP_OFFSET)
+ return NULL;
+
if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
struct tcphdr _hdr;
struct udphdr *hp;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8fc24c3743c5..c0526cc03980 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1827,7 +1827,6 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
enum skb_drop_reason reason;
- struct sock *rsk;
reason = psp_sk_rx_policy_check(sk, skb);
if (reason)
@@ -1863,24 +1862,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
if (nsk != sk) {
reason = tcp_child_process(sk, nsk, skb);
- if (reason) {
- rsk = nsk;
+ sock_put(nsk);
+ if (reason)
goto reset;
- }
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
reason = tcp_rcv_state_process(sk, skb);
- if (reason) {
- rsk = sk;
+ if (reason)
goto reset;
- }
return 0;
reset:
- tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
+ tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
discard:
sk_skb_reason_drop(sk, skb, reason);
/* Be careful here. If this function gets more complicated and
@@ -2193,8 +2189,10 @@ lookup:
rst_reason = sk_rst_convert_drop_reason(drop_reason);
tcp_v4_send_reset(nsk, skb, rst_reason);
+ sock_put(nsk);
goto discard_and_relse;
}
+ sock_put(nsk);
sock_put(sk);
return 0;
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 199f0b579e89..e6092c3ac840 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -1012,6 +1012,6 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
}
bh_unlock_sock(child);
- sock_put(child);
+
return reason;
}
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index c024aa77f25b..c3806c6ac96f 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -164,7 +164,7 @@ config IPV6_SIT
select INET_TUNNEL
select NET_IP_TUNNEL
select IPV6_NDISC_NODETYPE
- default y
+ default m
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -172,7 +172,7 @@ config IPV6_SIT
into IPv4 packets. This is useful if you want to connect two IPv6
networks over an IPv4-only path.
- Saying M here will produce a module called sit. If unsure, say Y.
+ Saying M here will produce a module called sit. If unsure, say M.
config IPV6_SIT_6RD
bool "IPv6: IPv6 Rapid Deployment (6RD)"
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index cb26beea4398..de1e68199a01 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -317,14 +317,19 @@ static void ah6_output_done(void *data, int err)
struct ipv6hdr *top_iph = ipv6_hdr(skb);
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
struct tmp_ext *iph_ext;
+ int seqhi_len = 0;
+ __be32 *seqhi;
extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
if (extlen)
extlen += sizeof(*iph_ext);
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
iph_base = AH_SKB_CB(skb)->tmp;
iph_ext = ah_tmp_ext(iph_base);
- icv = ah_tmp_icv(iph_ext, extlen);
+ seqhi = (__be32 *)((char *)iph_ext + extlen);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
@@ -471,13 +476,18 @@ static void ah6_input_done(void *data, int err)
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int hdr_len = skb_network_header_len(skb);
int ah_hlen = ipv6_authlen(ah);
+ int seqhi_len = 0;
+ __be32 *seqhi;
if (err)
goto out;
+ if (x->props.flags & XFRM_STATE_ESN)
+ seqhi_len = sizeof(*seqhi);
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, hdr_len);
- icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
+ seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
if (err)
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9f75313734f8..9c06c5a1419d 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -915,7 +915,8 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
nfrags = 1;
goto skip_cow;
- } else if (!skb_has_frag_list(skb)) {
+ } else if (!skb_has_frag_list(skb) &&
+ !skb_has_shared_frag(skb)) {
nfrags = skb_shinfo(skb)->nr_frags;
nfrags++;
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 49e31e4ae7b7..9d06d487e8b1 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -73,6 +73,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
__be16 *frag_offp)
{
u8 nexthdr = *nexthdrp;
+ int exthdr_cnt = 0;
*frag_offp = 0;
@@ -82,6 +83,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
if (nexthdr == NEXTHDR_NONE)
return -1;
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ return -1;
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
if (!hp)
return -1;
@@ -190,6 +193,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
{
unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int exthdr_cnt = 0;
bool found;
if (fragoff)
@@ -216,6 +220,9 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
return -ENOENT;
}
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ return -EBADMSG;
+
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
if (!hp)
return -EBADMSG;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 63fc8556b475..365b4059eb20 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -2262,10 +2262,11 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+ struct ip6_tnl *t = netdev_priv(dev);
struct __ip6_tnl_parm p;
- struct ip6_tnl *t;
+ struct ip6gre_net *ign;
+ ign = net_generic(t->net, ip6gre_net_id);
t = ip6gre_changelink_common(dev, tb, data, &p, extack);
if (IS_ERR(t))
return PTR_ERR(t);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 967b07aeb683..8972863c93ee 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -403,6 +403,7 @@ INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *));
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
bool have_final)
{
+ int exthdr_cnt = IP6CB(skb)->flags & IP6SKB_HOPBYHOP ? 1 : 0;
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
@@ -487,6 +488,10 @@ resubmit_final:
nexthdr = ret;
goto resubmit_final;
} else {
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) {
+ SKB_DR_SET(reason, IPV6_TOO_MANY_EXTHDRS);
+ goto discard;
+ }
goto resubmit;
}
} else if (ret == 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7e92909ab5be..c14adcdd4396 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -468,6 +468,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
default:
break;
}
+ hdr = ipv6_hdr(skb);
}
/*
@@ -582,6 +583,8 @@ int ip6_forward(struct sk_buff *skb)
if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
int proxied = ip6_forward_proxy_check(skb);
+
+ hdr = ipv6_hdr(skb);
if (proxied > 0) {
/* It's tempting to decrease the hop limit
* here by 1, as we do at the end of the
@@ -1794,6 +1797,8 @@ alloc_new_skb:
if (err < 0)
goto error;
copy = err;
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
wmem_alloc_delta += copy;
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index c468c83af0f2..9d1037ac082f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -399,11 +399,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
unsigned int nhoff = raw - skb->data;
unsigned int off = nhoff + sizeof(*ipv6h);
u8 nexthdr = ipv6h->nexthdr;
+ int exthdr_cnt = 0;
while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
struct ipv6_opt_hdr *hdr;
u16 optlen;
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ break;
+
if (!pskb_may_pull(skb, off + sizeof(*hdr)))
break;
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index ced8bd44828e..893f2aeb4711 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -100,6 +100,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
const struct in6_addr *daddr = NULL, *saddr = NULL;
struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
struct sk_buff *data_skb = NULL;
+ unsigned short fragoff = 0;
int doff = 0;
int thoff = 0, tproto;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -107,8 +108,8 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
struct nf_conn const *ct;
#endif
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
+ tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (tproto < 0 || fragoff) {
pr_debug("unable to find transport header in IPv6 packet, dropping\n");
return NULL;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 19eb6b702227..e3d355d1fbd6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1645,6 +1645,10 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
rcu_read_lock();
idev = __in6_dev_get(dev);
+ if (!idev) {
+ rcu_read_unlock();
+ return 0;
+ }
mtu = READ_ONCE(idev->cnf.mtu6);
rcu_read_unlock();
}
@@ -4995,6 +4999,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ fib6_update_sernum(net, rt);
rt6_multipath_rebalance(rt);
break;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2c3f7a739709..d13d49bfef19 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -288,8 +288,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
saddr = &fl6->saddr;
err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
- if (err)
+ if (err) {
+ dst_release(dst);
goto failure;
+ }
}
/* set the source address */
@@ -1617,12 +1619,13 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v6_cookie_check(sk, skb);
+ if (!nsk)
+ return 0;
if (nsk != sk) {
- if (nsk) {
- reason = tcp_child_process(sk, nsk, skb);
- if (reason)
- goto reset;
- }
+ reason = tcp_child_process(sk, nsk, skb);
+ sock_put(nsk);
+ if (reason)
+ goto reset;
return 0;
}
} else
@@ -1827,8 +1830,10 @@ lookup:
rst_reason = sk_rst_convert_drop_reason(drop_reason);
tcp_v6_send_reset(nsk, skb, rst_reason);
+ sock_put(nsk);
goto discard_and_relse;
}
+ sock_put(nsk);
sock_put(sk);
return 0;
}
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index ea2f805d3b01..9b586fcec485 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -88,8 +88,10 @@ int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6,
skb, flags);
- if (dst->error)
+ if (dst->error) {
+ dst_release(dst);
goto drop;
+ }
skb_dst_set(skb, dst);
}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 160ae65a5c64..0a0f27836d57 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -438,6 +438,15 @@ ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata,
ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
/*
+ * Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in
+ * their basic HT-MCS set. On cards with lower spatial streams, the check
 * would fail, and we'd be stuck with no HT when it in fact works fine with
+ * its own supported rate. So check it only in strict mode.
+ */
+ if (!ieee80211_hw_check(&sdata->local->hw, STRICT))
+ return true;
+
+ /*
* P802.11REVme/D7.0 - 6.5.4.2.4
* ...
* If the MLME of an HT STA receives an MLME-JOIN.request primitive
@@ -9140,7 +9149,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
struct ieee80211_bss *bss = (void *)cbss->priv;
struct sta_info *new_sta = NULL;
struct ieee80211_link_data *link;
- bool have_sta = false;
+ struct sta_info *have_sta = NULL;
bool mlo;
int err;
u16 new_links;
@@ -9159,11 +9168,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
mlo = false;
}
- if (assoc) {
- rcu_read_lock();
+ if (assoc)
have_sta = sta_info_get(sdata, ap_mld_addr);
- rcu_read_unlock();
- }
if (mlo && !have_sta &&
WARN_ON(sdata->vif.valid_links || sdata->vif.active_links))
@@ -9327,6 +9333,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
out_release_chan:
ieee80211_link_release_channel(link);
out_err:
+ if (mlo && have_sta)
+ WARN_ON(__sta_info_destroy(have_sta));
ieee80211_vif_set_links(sdata, 0, 0);
return err;
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3e5d1c47a5b0..d18e962126ce 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4971,7 +4971,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
struct sk_buff *skb = rx->skb;
struct ieee80211_hdr *hdr = (void *)skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
- static ieee80211_rx_result res;
+ ieee80211_rx_result res;
int orig_len = skb->len;
int hdrlen = ieee80211_hdrlen(hdr->frame_control);
int snap_offs = hdrlen;
@@ -5380,7 +5380,9 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
if (!link_sta)
goto out;
- ieee80211_rx_data_set_link(&rx, link_sta->link_id);
+ if (!ieee80211_rx_data_set_link(&rx,
+ link_sta->link_id))
+ goto out;
}
if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c
index adc069065e73..fa370831d617 100644
--- a/net/mac80211/tests/chan-mode.c
+++ b/net/mac80211/tests/chan-mode.c
@@ -65,6 +65,7 @@ static const struct determine_chan_mode_case {
.ht_capa_mask = {
.mcs.rx_mask[0] = 0xf7,
},
+ .strict = true,
}, {
.desc = "Masking out a RX rate in VHT capabilities",
.conn_mode = IEEE80211_CONN_MODE_EHT,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b093bc203c81..2529b01e2cd5 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3700,11 +3700,11 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy,
struct ieee80211_local *local =
container_of(work, struct ieee80211_local, radar_detected_work);
struct cfg80211_chan_def chandef;
- struct ieee80211_chanctx *ctx;
+ struct ieee80211_chanctx *ctx, *tmp;
lockdep_assert_wiphy(local->hw.wiphy);
- list_for_each_entry(ctx, &local->chanctx_list, list) {
+ list_for_each_entry_safe(ctx, tmp, &local->chanctx_list, list) {
if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index e1033643fab0..e4b230ef6099 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -920,9 +920,9 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test)
static void mctp_test_route_input_null_eid(struct kunit *test)
{
struct mctp_hdr hdr = RX_HDR(1, 10, 0, FL_S | FL_E | FL_TO);
+ struct sockaddr_mctp addr = { 0 };
struct sk_buff *skb_pkt, *skb_sk;
struct mctp_test_dev *dev;
- struct sockaddr_mctp addr;
struct socket *sock;
u8 type = 0;
int rc;
diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c
index c3987d5ade7a..6eef8d485c25 100644
--- a/net/mctp/test/utils.c
+++ b/net/mctp/test/utils.c
@@ -116,7 +116,7 @@ void mctp_test_destroy_dev(struct mctp_test_dev *dev)
static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
{
skb->dev = dst->dev->dev;
- dev_queue_xmit(skb);
+ dev_direct_xmit(skb, 0);
return 0;
}
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
index 82ec15bcfd7f..082c46c0f50e 100644
--- a/net/mptcp/fastopen.c
+++ b/net/mptcp/fastopen.c
@@ -12,6 +12,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
struct sock *sk, *ssk;
struct sk_buff *skb;
struct tcp_sock *tp;
+ bool has_rxtstamp;
/* on early fallback the subflow context is deleted by
* subflow_syn_recv_sock()
@@ -40,12 +41,13 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
*/
tp->copied_seq += skb->len;
subflow->ssn_offset += skb->len;
+ has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
/* Only the sequence delta is relevant */
MPTCP_SKB_CB(skb)->map_seq = -skb->len;
MPTCP_SKB_CB(skb)->end_seq = 0;
MPTCP_SKB_CB(skb)->offset = 0;
- MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+ MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
MPTCP_SKB_CB(skb)->cant_coalesce = 1;
mptcp_data_lock(sk);
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 57a456690406..3c152bf66cd5 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -16,6 +16,7 @@ struct mptcp_pm_add_entry {
struct list_head list;
struct mptcp_addr_info addr;
u8 retrans_times;
+ bool timer_done;
struct timer_list add_timer;
struct mptcp_sock *sock;
struct rcu_head rcu;
@@ -283,6 +284,9 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
struct mptcp_addr_info local, remote;
+ if (!__mptcp_subflow_active(subflow))
+ continue;
+
mptcp_local_address((struct sock_common *)ssk, &local);
if (!mptcp_addresses_equal(&local, addr, addr->port))
continue;
@@ -305,18 +309,31 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk)
const struct net *net = sock_net((struct sock *)msk);
unsigned int rto = mptcp_get_add_addr_timeout(net);
struct mptcp_subflow_context *subflow;
- unsigned int max = 0;
+ unsigned int max = 0, max_stale = 0;
+
+ if (!rto)
+ return 0;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
struct inet_connection_sock *icsk = inet_csk(ssk);
- if (icsk->icsk_rto > max)
+ if (!__mptcp_subflow_active(subflow))
+ continue;
+
+ if (unlikely(subflow->stale)) {
+ if (icsk->icsk_rto > max_stale)
+ max_stale = icsk->icsk_rto;
+ } else if (icsk->icsk_rto > max) {
max = icsk->icsk_rto;
+ }
}
- if (max && max < rto)
- rto = max;
+ if (max)
+ return min(max, rto);
+
+ if (max_stale)
+ return min(max_stale, rto);
return rto;
}
@@ -327,26 +344,22 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
add_timer);
struct mptcp_sock *msk = entry->sock;
struct sock *sk = (struct sock *)msk;
- unsigned int timeout;
+ unsigned int timeout = 0;
pr_debug("msk=%p\n", msk);
- if (!msk)
- return;
-
- if (inet_sk_state_load(sk) == TCP_CLOSE)
- return;
-
- if (!entry->addr.id)
- return;
+ bh_lock_sock(sk);
+ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
+ goto out;
- if (mptcp_pm_should_add_signal_addr(msk)) {
- sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ timeout = HZ / 20;
goto out;
}
timeout = mptcp_adjust_add_addr_timeout(msk);
- if (!timeout)
+ if (!timeout || mptcp_pm_should_add_signal_addr(msk))
goto out;
spin_lock_bh(&msk->pm.lock);
@@ -359,8 +372,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
}
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
- sk_reset_timer(sk, timer,
- jiffies + (timeout << entry->retrans_times));
+ timeout <<= entry->retrans_times;
+ else
+ timeout = 0;
spin_unlock_bh(&msk->pm.lock);
@@ -368,7 +382,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
mptcp_pm_subflow_established(msk);
out:
- __sock_put(sk);
+ if (timeout)
+ sk_reset_timer(sk, timer, jiffies + timeout);
+ else
+ /* if sock_put calls sk_free: avoid waiting for this timer */
+ entry->timer_done = true;
+ bh_unlock_sock(sk);
+ sock_put(sk);
}
struct mptcp_pm_add_entry *
@@ -431,6 +451,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
reset_timer:
+ add_entry->timer_done = false;
timeout = mptcp_adjust_add_addr_timeout(msk);
if (timeout)
sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
@@ -451,7 +472,8 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
list_for_each_entry_safe(entry, tmp, &free_list, list) {
- sk_stop_timer_sync(sk, &entry->add_timer);
+ if (!entry->timer_done)
+ sk_stop_timer_sync(sk, &entry->add_timer);
kfree_rcu(entry, rcu);
}
}
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index c9f1e5af3cd3..fc818b63752e 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -347,6 +347,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
/* check first for announce */
if (msk->pm.add_addr_signaled < endp_signal_max) {
+ u8 endp_id;
+
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
@@ -360,19 +362,20 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
if (!select_signal_address(pernet, msk, &local))
goto subflow;
+ /* Special case for ID0: set the correct ID */
+ endp_id = local.addr.id;
+ if (endp_id == msk->mpc_endpoint_id)
+ local.addr.id = 0;
+
/* If the alloc fails, we are on memory pressure, not worth
* continuing, and trying to create subflows.
*/
if (!mptcp_pm_alloc_anno_list(msk, &local.addr))
return;
- __clear_bit(local.addr.id, msk->pm.id_avail_bitmap);
+ __clear_bit(endp_id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++;
- /* Special case for ID0: set the correct ID */
- if (local.addr.id == msk->mpc_endpoint_id)
- local.addr.id = 0;
-
mptcp_pm_announce_addr(msk, &local.addr, false);
mptcp_pm_addr_send_ack(msk);
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 0efe40be2fde..1cf608e7357b 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -812,6 +812,10 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
if (ret)
break;
}
+
+ if (!ret)
+ sockopt_seq_inc(msk);
+
return ret;
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index e2cb9d23e4a0..d562e149606f 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -581,7 +581,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->backup);
if (!subflow_thmac_valid(subflow)) {
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
subflow->reset_reason = MPTCP_RST_EMPTCP;
goto do_reset;
}
@@ -908,7 +908,7 @@ create_child:
if (!subflow_hmac_valid(subflow_req, &mp_opt)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
- subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
goto dispose_child;
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2082bfb2d93c..9ea6b4fa78bf 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
hash_key2 = hash_key;
use2 = false;
}
+
conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
&head, &head2);
- spin_lock(&cp->lock);
-
- if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- cp->flags |= IP_VS_CONN_F_HASHED;
- WRITE_ONCE(cp->hn0.hash_key, hash_key);
- WRITE_ONCE(cp->hn1.hash_key, hash_key2);
- refcount_inc(&cp->refcnt);
- hlist_bl_add_head_rcu(&cp->hn0.node, head);
- if (use2)
- hlist_bl_add_head_rcu(&cp->hn1.node, head2);
- ret = 1;
- } else {
- pr_err("%s(): request for already hashed, called from %pS\n",
- __func__, __builtin_return_address(0));
- ret = 0;
- }
- spin_unlock(&cp->lock);
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ WRITE_ONCE(cp->hn0.hash_key, hash_key);
+ WRITE_ONCE(cp->hn1.hash_key, hash_key2);
+ refcount_inc(&cp->refcnt);
+ hlist_bl_add_head_rcu(&cp->hn0.node, head);
+ if (use2)
+ hlist_bl_add_head_rcu(&cp->hn1.node, head2);
+
conn_tab_unlock(head, head2);
+ ret = 1;
/* Schedule resizing if load increases */
if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
@@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
&head, &head2);
- spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
/* Decrease refcnt and unlink conn only if we are last user */
@@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
}
}
- spin_unlock(&cp->lock);
conn_tab_unlock(head, head2);
rcu_read_unlock();
@@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
struct ip_vs_conn_hnode *hn;
u32 hash_key, hash_key_new;
struct ip_vs_conn_param p;
+ bool by_me = false;
int ntbl;
int dir;
@@ -664,8 +656,16 @@ retry:
t = rcu_dereference(t->new_tbl);
ntbl++;
/* We are lost? */
- if (ntbl >= 2)
+ if (ntbl >= 2) {
+ spin_lock_bh(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me)
+ cp->cport = 0;
+ /* hn1 will be rehashed on next packet */
+ spin_unlock_bh(&cp->lock);
+ IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n",
+ __func__, dir);
return;
+ }
}
/* Rehashing during resize? Use the recent table for adds */
@@ -683,10 +683,13 @@ retry:
if (head > head2 && t == t2)
swap(head, head2);
+ /* Protect the cp->flags modification */
+ spin_lock_bh(&cp->lock);
+
/* Lock seqcount only for the old bucket, even if we are on new table
* because it affects the del operation, not the adding.
*/
- spin_lock_bh(&t->lock[hash_key & t->lock_mask].l);
+ spin_lock(&t->lock[hash_key & t->lock_mask].l);
preempt_disable_nested();
write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]);
@@ -704,14 +707,23 @@ retry:
hlist_bl_unlock(head);
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
preempt_enable_nested();
- spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
+ spin_unlock(&t->lock[hash_key & t->lock_mask].l);
+ spin_unlock_bh(&cp->lock);
hash_key = hash_key_new;
goto retry;
}
- spin_lock(&cp->lock);
- if ((cp->flags & IP_VS_CONN_F_NO_CPORT) &&
- (cp->flags & IP_VS_CONN_F_HASHED)) {
+ /* Fill cport once, even if multiple packets try to do it */
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) {
+ /* If we race with resizing make sure cport is set for dir 1 */
+ if (!cp->cport) {
+ cp->cport = cport;
+ by_me = true;
+ }
+ if (!dir) {
+ atomic_dec(&ipvs->no_cport_conns[af_id]);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ }
/* We do not recalc hash_key_r under lock, we assume the
* parameters in cp do not change, i.e. cport is
* the only possible change.
@@ -726,21 +738,17 @@ retry:
hlist_bl_del_rcu(&hn->node);
hlist_bl_add_head_rcu(&hn->node, head_new);
}
- if (!dir) {
- atomic_dec(&ipvs->no_cport_conns[af_id]);
- cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
- cp->cport = cport;
- }
}
- spin_unlock(&cp->lock);
if (head != head2)
hlist_bl_unlock(head2);
hlist_bl_unlock(head);
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
preempt_enable_nested();
- spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
- if (dir--)
+ spin_unlock(&t->lock[hash_key & t->lock_mask].l);
+
+ spin_unlock_bh(&cp->lock);
+ if (dir-- && by_me)
goto next_dir;
}
@@ -1835,7 +1843,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
if (!rcu_dereference_protected(ipvs->conn_tab, 1))
return;
- cancel_delayed_work_sync(&ipvs->conn_resize_work);
+ disable_delayed_work_sync(&ipvs->conn_resize_work);
if (!atomic_read(&ipvs->conn_count))
goto unreg;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f5b7a2047291..d40b404c1bf6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
{
if (!t)
return 1 << min_bits;
- n = roundup_pow_of_two(n);
+ n = n > 0 ? roundup_pow_of_two(n) : 1;
if (lfactor < 0) {
int factor = min(-lfactor, max_bits);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6632daa87ded..c7c7f6a7a9f6 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work)
if (!kd)
continue;
/* New config ? Stop kthread tasks */
- if (genid != genid_done)
- ip_vs_est_kthread_stop(kd);
+ if (genid != genid_done) {
+ if (!id) {
+ /* Only we can stop kt 0 but not under mutex */
+ mutex_unlock(&ipvs->est_mutex);
+ ip_vs_est_kthread_stop(kd);
+ mutex_lock(&ipvs->est_mutex);
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock;
+ /* kd for kt 0 is never destroyed */
+ } else {
+ ip_vs_est_kthread_stop(kd);
+ }
+ }
if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ bool start;
+
/* Do not start kthreads above 0 in calc phase */
- if ((!id || !ipvs->est_calc_phase) &&
- ip_vs_est_kthread_start(ipvs, kd) < 0)
+ if (id)
+ start = !ipvs->est_calc_phase;
+ else
+ start = kd->needed;
+ if (start && ip_vs_est_kthread_start(ipvs, kd) < 0)
repeat = true;
}
}
@@ -1102,6 +1118,24 @@ out:
return dest;
}
+/* Put destination in trash */
+static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest, unsigned long istart,
+ bool cleanup)
+{
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ refcount_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash with reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = istart;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+}
+
static void ip_vs_dest_rcu_free(struct rcu_head *head)
{
struct ip_vs_dest *dest;
@@ -1461,9 +1495,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ntohs(dest->vport));
ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ /* On error put back dest into the trash */
if (ret < 0)
- return ret;
- __ip_vs_update_dest(svc, dest, udest, 1);
+ ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start,
+ false);
+ else
+ __ip_vs_update_dest(svc, dest, udest, 1);
} else {
/*
* Allocate and initialize the dest structure
@@ -1533,17 +1570,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
*/
ip_vs_rs_unhash(dest);
- spin_lock_bh(&ipvs->dest_trash_lock);
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
- refcount_read(&dest->refcnt));
- if (list_empty(&ipvs->dest_trash) && !cleanup)
- mod_timer(&ipvs->dest_trash_timer,
- jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
- /* dest lives in trash with reference */
- list_add(&dest->t_list, &ipvs->dest_trash);
- dest->idle_start = 0;
- spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_trash_put_dest(ipvs, dest, 0, cleanup);
/* Queue up delayed work to expire all no destination connections.
* No-op when CONFIG_SYSCTL is disabled.
@@ -1812,11 +1839,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
*svc_p = svc;
if (!READ_ONCE(ipvs->enable)) {
+ mutex_lock(&ipvs->est_mutex);
+
/* Now there is a service - full throttle */
WRITE_ONCE(ipvs->enable, 1);
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
/* Start estimation for first time */
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
+ mutex_unlock(&ipvs->est_mutex);
}
return 0;
@@ -2032,6 +2064,9 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
cancel_delayed_work_sync(&ipvs->svc_resize_work);
if (t) {
rcu_assign_pointer(ipvs->svc_table, NULL);
+ /* Inform readers that table is removed */
+ smp_mb__before_atomic();
+ atomic_inc(&ipvs->svc_table_changes);
while (1) {
p = rcu_dereference_protected(t->new_tbl, 1);
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
@@ -2078,6 +2113,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
t = rcu_dereference_protected(ipvs->svc_table, 1);
if (t) {
rcu_assign_pointer(ipvs->svc_table, NULL);
+ /* Inform readers that table is removed */
+ smp_mb__before_atomic();
+ atomic_inc(&ipvs->svc_table_changes);
while (1) {
p = rcu_dereference_protected(t->new_tbl, 1);
call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
@@ -2086,6 +2124,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
t = p;
}
}
+ /* Stop the tot_stats estimator early under service_mutex
+ * to avoid locking it again later.
+ */
+ if (cleanup)
+ ip_vs_stop_estimator_tot_stats(ipvs);
return 0;
}
@@ -2331,7 +2374,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
/* est_max_threads may depend on cpulist size */
ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
ipvs->est_calc_phase = 1;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
unlock:
mutex_unlock(&ipvs->est_mutex);
@@ -2351,11 +2394,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
mutex_lock(&ipvs->est_mutex);
- if (ipvs->est_cpulist_valid)
- mask = *valp;
- else
- mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
- ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+ /* HK_TYPE_KTHREAD cpumask needs RCU protection */
+ scoped_guard(rcu) {
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+ }
mutex_unlock(&ipvs->est_mutex);
@@ -2411,7 +2457,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
mutex_lock(&ipvs->est_mutex);
if (*valp != val) {
*valp = val;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
}
mutex_unlock(&ipvs->est_mutex);
}
@@ -2438,7 +2484,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
mutex_lock(&ipvs->est_mutex);
if (*valp != val) {
*valp = val;
- ip_vs_est_reload_start(ipvs);
+ ip_vs_est_reload_start(ipvs, true);
}
mutex_unlock(&ipvs->est_mutex);
}
@@ -2463,7 +2509,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
if (val < -8 || val > 8) {
ret = -EINVAL;
} else {
- *valp = val;
+ WRITE_ONCE(*valp, val);
if (rcu_access_pointer(ipvs->conn_tab))
mod_delayed_work(system_unbound_wq,
&ipvs->conn_resize_work, 0);
@@ -2490,10 +2536,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
if (val < -8 || val > 8) {
ret = -EINVAL;
} else {
- *valp = val;
- if (rcu_access_pointer(ipvs->svc_table))
+ mutex_lock(&ipvs->service_mutex);
+ WRITE_ONCE(*valp, val);
+ /* Make sure the services are present */
+ if (rcu_access_pointer(ipvs->svc_table) &&
+ READ_ONCE(ipvs->enable) &&
+ !test_bit(IP_VS_WORK_SVC_NORESIZE,
+ &ipvs->work_flags))
mod_delayed_work(system_unbound_wq,
&ipvs->svc_resize_work, 0);
+ mutex_unlock(&ipvs->service_mutex);
}
}
return ret;
@@ -3004,7 +3056,8 @@ static int ip_vs_status_show(struct seq_file *seq, void *v)
int old_gen, new_gen;
u32 counts[8];
u32 bucket;
- int count;
+ u32 count;
+ int loops;
u32 sum1;
u32 sum;
int i;
@@ -3020,6 +3073,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v)
if (!atomic_read(&ipvs->conn_count))
goto after_conns;
old_gen = atomic_read(&ipvs->conn_tab_changes);
+ loops = 0;
repeat_conn:
smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
@@ -3032,8 +3086,11 @@ repeat_conn:
resched_score++;
ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
count = 0;
- hlist_bl_for_each_entry_rcu(hn, e, head, node)
+ hlist_bl_for_each_entry_rcu(hn, e, head, node) {
count++;
+ if (count >= ARRAY_SIZE(counts) - 1)
+ break;
+ }
}
resched_score += count;
if (resched_score >= 100) {
@@ -3042,37 +3099,41 @@ repeat_conn:
new_gen = atomic_read(&ipvs->conn_tab_changes);
/* New table installed ? */
if (old_gen != new_gen) {
+ /* Too many changes? */
+ if (++loops >= 5)
+ goto after_conns;
old_gen = new_gen;
goto repeat_conn;
}
}
- counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ counts[count]++;
}
}
for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
sum += counts[i];
sum1 = sum - counts[0];
- seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n",
- counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n",
+ counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
for (i = 1; i < ARRAY_SIZE(counts); i++) {
if (!counts[i])
continue;
- seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n",
+ seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n",
i, counts[i],
- (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
}
after_conns:
t = rcu_dereference(ipvs->svc_table);
count = ip_vs_get_num_services(ipvs);
- seq_printf(seq, "Services:\t%d\n", count);
+ seq_printf(seq, "Services:\t%u\n", count);
seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
if (!count)
goto after_svc;
old_gen = atomic_read(&ipvs->svc_table_changes);
+ loops = 0;
repeat_svc:
smp_rmb(); /* ipvs->svc_table and svc_table_changes */
@@ -3086,8 +3147,11 @@ repeat_svc:
ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
count = 0;
hlist_bl_for_each_entry_rcu(svc, e, head,
- s_list)
+ s_list) {
count++;
+ if (count >= ARRAY_SIZE(counts) - 1)
+ break;
+ }
}
resched_score += count;
if (resched_score >= 100) {
@@ -3096,24 +3160,27 @@ repeat_svc:
new_gen = atomic_read(&ipvs->svc_table_changes);
/* New table installed ? */
if (old_gen != new_gen) {
+ /* Too many changes? */
+ if (++loops >= 5)
+ goto after_svc;
old_gen = new_gen;
goto repeat_svc;
}
}
- counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
+ counts[count]++;
}
}
for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
sum += counts[i];
sum1 = sum - counts[0];
- seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n",
- counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
+ seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n",
+ counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
for (i = 1; i < ARRAY_SIZE(counts); i++) {
if (!counts[i])
continue;
- seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n",
+ seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n",
i, counts[i],
- (unsigned long)counts[i] * 100 / max(sum1, 1U));
+ div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
}
after_svc:
@@ -4967,7 +5034,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ipvs->tot_stats->s.est.ktid != -2) {
+ /* Not stopped yet? This happens only on netns init error and
+ * we even do not need to lock the service_mutex for this case.
+ */
+ mutex_lock(&ipvs->service_mutex);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ mutex_unlock(&ipvs->service_mutex);
+ }
if (ipvs->est_cpulist_valid)
free_cpumask_var(ipvs->sysctl_est_cpulist);
@@ -5039,7 +5113,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
ipvs->net->proc_net,
ip_vs_stats_percpu_show, NULL))
goto err_percpu;
- if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
+ if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
ip_vs_status_show, NULL))
goto err_status;
#endif
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 433ba3cab58c..ab09f5182951 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -68,6 +68,11 @@
and the limit of estimators per kthread
- est_add_ktid: ktid where to add new ests, can point to empty slot where
we should add kt data
+ - data protected by service_mutex: est_temp_list, est_add_ktid,
+ est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W)
+ - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist,
+ est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation,
+ est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0)
*/
static struct lock_class_key __ipvs_est_key;
@@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data)
}
/* Schedule stop/start for kthread tasks */
-void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart)
{
+ lockdep_assert_held(&ipvs->est_mutex);
+
/* Ignore reloads before first service is added */
if (!READ_ONCE(ipvs->enable))
return;
ip_vs_est_stopped_recalc(ipvs);
- /* Bump the kthread configuration genid */
- atomic_inc(&ipvs->est_genid);
+ /* Bump the kthread configuration genid if stopping is requested */
+ if (restart)
+ atomic_inc(&ipvs->est_genid);
queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}
@@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
void *arr = NULL;
int i;
- if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
- READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
- return -EINVAL;
-
mutex_lock(&ipvs->est_mutex);
+ /* Allow kt 0 data to be created before the services are added
+ * and limit the kthreads when services are present.
+ */
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
+ ret = -EINVAL;
+ goto out;
+ }
+
for (i = 0; i < id; i++) {
if (!ipvs->est_kt_arr[i])
break;
@@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
kd->est_timer = jiffies;
kd->id = id;
ip_vs_est_set_params(ipvs, kd);
+ kd->needed = 1;
/* Pre-allocate stats used in calc phase */
if (!id && !kd->calc_stats) {
@@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
goto out;
}
- /* Start kthread tasks only when services are present */
- if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
- ret = ip_vs_est_kthread_start(ipvs, kd);
- if (ret < 0)
- goto out;
- }
+ /* Request kthread to be started */
+ ip_vs_est_reload_start(ipvs, false);
if (arr)
ipvs->est_kt_count++;
@@ -482,12 +492,11 @@ out:
/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ?
+ ipvs->est_kt_arr[0] : NULL;
struct ip_vs_estimator *est = &stats->est;
int ret;
- if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
- ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
-
est->ktid = -1;
est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
@@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
* will not allocate much memory, just for kt 0.
*/
ret = 0;
- if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ if (!kd) {
ret = ip_vs_est_add_kthread(ipvs);
+ } else if (!kd->needed) {
+ mutex_lock(&ipvs->est_mutex);
+ /* We have job for the kt 0 task */
+ kd->needed = 1;
+ ip_vs_est_reload_start(ipvs, true);
+ mutex_unlock(&ipvs->est_mutex);
+ }
if (ret >= 0)
hlist_add_head(&est->list, &ipvs->est_temp_list);
else
@@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
}
end_kt0:
- /* kt 0 is freed after all other kthreads and chains are empty */
+ /* kt 0 task is stopped after all other kt slots and chains are empty */
if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
kd = ipvs->est_kt_arr[0];
- if (!kd || !kd->est_count) {
+ if (kd && !kd->est_count) {
mutex_lock(&ipvs->est_mutex);
- if (kd) {
- ip_vs_est_kthread_destroy(kd);
- ipvs->est_kt_arr[0] = NULL;
- }
- ipvs->est_kt_count--;
+ /* Keep the kt0 data but request kthread_stop */
+ kd->needed = 0;
+ ip_vs_est_reload_start(ipvs, true);
mutex_unlock(&ipvs->est_mutex);
ipvs->est_add_ktid = 0;
}
@@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
u64 val;
INIT_HLIST_HEAD(&chain);
- mutex_lock(&ipvs->service_mutex);
+ mutex_lock(&ipvs->est_mutex);
kd = ipvs->est_kt_arr[0];
- mutex_unlock(&ipvs->service_mutex);
+ mutex_unlock(&ipvs->est_mutex);
s = kd ? kd->calc_stats : NULL;
if (!s)
goto out;
@@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
if (!ip_vs_est_calc_limits(ipvs, &chain_max))
return;
- mutex_lock(&ipvs->service_mutex);
-
/* Stop all other tasks, so that we can immediately move the
* estimators to est_temp_list without RCU grace period
*/
mutex_lock(&ipvs->est_mutex);
for (id = 1; id < ipvs->est_kt_count; id++) {
/* netns clean up started, abort */
- if (!READ_ONCE(ipvs->enable))
- goto unlock2;
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) {
+ mutex_unlock(&ipvs->est_mutex);
+ return;
+ }
kd = ipvs->est_kt_arr[id];
if (!kd)
continue;
@@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
}
mutex_unlock(&ipvs->est_mutex);
+ mutex_lock(&ipvs->service_mutex);
+
/* Move all estimators to est_temp_list but carefully,
* all estimators and kthread data can be released while
- * we reschedule. Even for kthread 0.
+ * we reschedule.
*/
step = 0;
@@ -849,9 +865,7 @@ walk_chain:
ip_vs_stop_estimator(ipvs, stats);
/* Tasks are stopped, move without RCU grace period */
est->ktid = -1;
- est->ktrow = row - kd->est_row;
- if (est->ktrow < 0)
- est->ktrow += IPVS_EST_NTICKS;
+ est->ktrow = delay;
hlist_add_head(&est->list, &ipvs->est_temp_list);
/* kd freed ? */
if (last)
@@ -889,7 +903,6 @@ end_dequeue:
if (genid == atomic_read(&ipvs->est_genid))
ipvs->est_calc_phase = 0;
-unlock2:
mutex_unlock(&ipvs->est_mutex);
unlock:
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index e348fb90b8dc..3b0a70e154cd 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,22 +13,6 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-#define NF_RECURSION_LIMIT 2
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
- return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
-}
-#else
-
-static u8 *nf_get_nf_dup_skb_recursion(void)
-{
- return &current->net_xmit.nf_dup_skb_recursion;
-}
-
-#endif
-
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 2c4140e6f53c..785d8c244a77 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -122,6 +122,7 @@ static int flow_offload_fill_route(struct flow_offload *flow,
flow_tuple->tun = route->tuple[dir].in.tun;
flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+ flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment;
flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
switch (route->tuple[dir].xmit_type) {
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index fd56d663cb5b..9c05a50d6013 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -445,13 +445,13 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
switch (skb->protocol) {
case htons(ETH_P_8021Q):
vlan_hdr = (struct vlan_hdr *)skb->data;
- __skb_pull(skb, VLAN_HLEN);
+ skb_pull_rcsum(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vlan_hdr);
skb_reset_network_header(skb);
break;
case htons(ETH_P_PPP_SES):
skb->protocol = __nf_flow_pppoe_proto(skb);
- skb_pull(skb, PPPOE_SES_HLEN);
+ skb_pull_rcsum(skb, PPPOE_SES_HLEN);
skb_reset_network_header(skb);
break;
}
@@ -462,23 +462,6 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
nf_flow_ip_tunnel_pop(ctx, skb);
}
-struct nf_flow_xmit {
- const void *dest;
- const void *source;
- struct net_device *outdev;
-};
-
-static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
- struct nf_flow_xmit *xmit)
-{
- skb->dev = xmit->outdev;
- dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
- xmit->dest, xmit->source, skb->len);
- dev_queue_xmit(skb);
-
- return NF_STOLEN;
-}
-
static struct flow_offload_tuple_rhash *
nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
struct nf_flowtable *flow_table, struct sk_buff *skb)
@@ -524,7 +507,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
return 0;
}
- if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ if (skb_ensure_writable(skb, thoff + ctx->hdrsize))
return -1;
flow_offload_refresh(flow_table, flow, false);
@@ -544,7 +527,34 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
return 1;
}
-static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
+/* Similar to skb_vlan_push. */
+static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id,
+ u32 needed_headroom)
+{
+ if (skb_vlan_tag_present(skb)) {
+ struct vlan_hdr *vhdr;
+
+ if (skb_cow_head(skb, needed_headroom + VLAN_HLEN))
+ return -1;
+
+ __skb_push(skb, VLAN_HLEN);
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header -= VLAN_HLEN;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ skb->network_header -= VLAN_HLEN;
+ vhdr->h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ vhdr->h_vlan_encapsulated_proto = skb->protocol;
+ skb->protocol = skb->vlan_proto;
+ skb_postpush_rcsum(skb, skb->data, VLAN_HLEN);
+ }
+ __vlan_hwaccel_put_tag(skb, proto, id);
+
+ return 0;
+}
+
+static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id,
+ u32 needed_headroom)
{
int data_len = skb->len + sizeof(__be16);
struct ppp_hdr {
@@ -553,7 +563,7 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
} *ph;
__be16 proto;
- if (skb_cow_head(skb, PPPOE_SES_HLEN))
+ if (skb_cow_head(skb, needed_headroom + PPPOE_SES_HLEN))
return -1;
switch (skb->protocol) {
@@ -730,21 +740,24 @@ static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
}
static int nf_flow_encap_push(struct sk_buff *skb,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple,
+ struct net_device *outdev)
{
+ u32 needed_headroom = LL_RESERVED_SPACE(outdev);
int i;
- for (i = 0; i < tuple->encap_num; i++) {
+ for (i = tuple->encap_num - 1; i >= 0; i--) {
switch (tuple->encap[i].proto) {
case htons(ETH_P_8021Q):
case htons(ETH_P_8021AD):
- skb_reset_mac_header(skb);
- if (skb_vlan_push(skb, tuple->encap[i].proto,
- tuple->encap[i].id) < 0)
+ if (nf_flow_vlan_push(skb, tuple->encap[i].proto,
+ tuple->encap[i].id,
+ needed_headroom) < 0)
return -1;
break;
case htons(ETH_P_PPP_SES):
- if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0)
+ if (nf_flow_pppoe_push(skb, tuple->encap[i].id,
+ needed_headroom) < 0)
return -1;
break;
}
@@ -753,6 +766,76 @@ static int nf_flow_encap_push(struct sk_buff *skb,
return 0;
}
+struct nf_flow_xmit {
+ const void *dest;
+ const void *source;
+ struct net_device *outdev;
+ struct flow_offload_tuple *tuple;
+ bool needs_gso_segment;
+};
+
+static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ struct net_device *dev = xmit->outdev;
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return;
+ }
+
+ skb->dev = dev;
+ dev_hard_header(skb, dev, ntohs(skb->protocol),
+ xmit->dest, xmit->source, skb->len);
+ dev_queue_xmit(skb);
+}
+
+static unsigned int nf_flow_encap_gso_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ struct sk_buff *segs, *nskb;
+
+ segs = skb_gso_segment(skb, 0);
+ if (IS_ERR(segs))
+ return NF_DROP;
+
+ if (segs)
+ consume_skb(skb);
+ else
+ segs = skb;
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+
+ if (nf_flow_encap_push(segs, xmit->tuple, xmit->outdev) < 0) {
+ kfree_skb(segs);
+ kfree_skb_list(nskb);
+ return NF_STOLEN;
+ }
+ __nf_flow_queue_xmit(net, segs, xmit);
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ if (xmit->tuple->encap_num) {
+ if (skb_is_gso(skb) && xmit->needs_gso_segment)
+ return nf_flow_encap_gso_xmit(net, skb, xmit);
+
+ if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0)
+ return NF_DROP;
+ }
+
+ __nf_flow_queue_xmit(net, skb, xmit);
+
+ return NF_STOLEN;
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -797,9 +880,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0)
return NF_DROP;
- if (nf_flow_encap_push(skb, other_tuple) < 0)
- return NF_DROP;
-
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
rt = dst_rtable(tuplehash->tuple.dst_cache);
@@ -829,6 +909,8 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
WARN_ON_ONCE(1);
return NF_DROP;
}
+ xmit.tuple = other_tuple;
+ xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment;
return nf_flow_queue_xmit(state->net, skb, &xmit);
}
@@ -1037,7 +1119,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
return 0;
}
- if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ if (skb_ensure_writable(skb, thoff + ctx->hdrsize))
return -1;
flow_offload_refresh(flow_table, flow, false);
@@ -1119,9 +1201,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
&ip6_daddr, encap_limit) < 0)
return NF_DROP;
- if (nf_flow_encap_push(skb, other_tuple) < 0)
- return NF_DROP;
-
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
rt = dst_rt6_info(tuplehash->tuple.dst_cache);
@@ -1151,6 +1230,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
WARN_ON_ONCE(1);
return NF_DROP;
}
+ xmit.tuple = other_tuple;
+ xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment;
return nf_flow_queue_xmit(state->net, skb, &xmit);
}
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index 6bb9579dcc2a..9e88ea6a2eef 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -86,6 +86,7 @@ struct nft_forward_info {
u8 ingress_vlans;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
+ bool needs_gso_segment;
enum flow_offload_xmit_type xmit_type;
};
@@ -138,8 +139,11 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
path->encap.proto;
info->num_encaps++;
}
- if (path->type == DEV_PATH_PPPOE)
+ if (path->type == DEV_PATH_PPPOE) {
memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ info->needs_gso_segment = 1;
+ }
break;
case DEV_PATH_BRIDGE:
if (is_zero_ether_addr(info->h_source))
@@ -279,6 +283,7 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
route->tuple[dir].xmit_type = info.xmit_type;
}
+ route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment;
}
int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d20ce5c36d31..87387adbca65 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -407,6 +407,7 @@ static void nft_netdev_unregister_trans_hook(struct net *net,
}
static void nft_netdev_unregister_hooks(struct net *net,
+ const struct nft_table *table,
struct list_head *hook_list,
bool release_netdev)
{
@@ -414,8 +415,10 @@ static void nft_netdev_unregister_hooks(struct net *net,
struct nf_hook_ops *ops;
list_for_each_entry_safe(hook, next, hook_list, list) {
- list_for_each_entry(ops, &hook->ops_list, list)
- nf_unregister_net_hook(net, ops);
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
+ }
if (release_netdev)
nft_netdev_hook_unlink_free_rcu(hook);
}
@@ -452,20 +455,25 @@ static void __nf_tables_unregister_hook(struct net *net,
struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
- if (table->flags & NFT_TABLE_F_DORMANT ||
- !nft_is_base_chain(chain))
+ if (!nft_is_base_chain(chain))
return;
basechain = nft_base_chain(chain);
ops = &basechain->ops;
+ /* must also be called for dormant tables */
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ nft_netdev_unregister_hooks(net, table, &basechain->hook_list,
+ release_netdev);
+ return;
+ }
+
+ if (table->flags & NFT_TABLE_F_DORMANT)
+ return;
+
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
- nft_netdev_unregister_hooks(net, &basechain->hook_list,
- release_netdev);
- else
- nf_unregister_net_hook(net, &basechain->ops);
+ nf_unregister_net_hook(net, &basechain->ops);
}
static void nf_tables_unregister_hook(struct net *net,
@@ -4205,6 +4213,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
struct nft_chain *chain;
struct nft_ctx ctx = {
.net = net,
+ .table = (struct nft_table *)table,
.family = table->family,
};
int err = 0;
@@ -11281,11 +11290,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
- if (!(table->flags & NFT_TABLE_F_DORMANT)) {
- nft_netdev_unregister_hooks(net,
- &nft_trans_chain_hooks(trans),
- true);
- }
+ nft_netdev_unregister_hooks(net, table,
+ &nft_trans_chain_hooks(trans),
+ true);
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 5ddd5b6e135f..8ab186f86dd4 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -153,7 +153,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb) + pkt->nhoff;
else {
- if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
return false;
ptr = skb->data + nft_thoff(pkt);
}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index decc725a33c2..0caa9304d2d0 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -261,10 +261,10 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
}
- nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
-
nft_compat_wait_for_destructors(ctx->net);
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
ret = xt_check_target(&par, size, proto, inv);
if (ret < 0) {
if (ret == -ENOENT) {
@@ -353,8 +353,6 @@ nla_put_failure:
static int nft_target_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- struct xt_target *target = expr->ops->data;
- unsigned int hook_mask = 0;
int ret;
if (ctx->family != NFPROTO_IPV4 &&
@@ -377,11 +375,21 @@ static int nft_target_validate(const struct nft_ctx *ctx,
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
const struct nf_hook_ops *ops = &basechain->ops;
+ unsigned int hook_mask = 1 << ops->hooknum;
+ struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_tgchk_param par;
+ union nft_entry e = {};
- hook_mask = 1 << ops->hooknum;
if (target->hooks && !(hook_mask & target->hooks))
return -EINVAL;
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, 0, false);
+
+ ret = xt_check_hooks_target(&par);
+ if (ret < 0)
+ return ret;
+
ret = nft_compat_chain_validate_dependency(ctx, target->table);
if (ret < 0)
return ret;
@@ -515,10 +523,10 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return ret;
}
- nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
-
nft_compat_wait_for_destructors(ctx->net);
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
return xt_check_match(&par, size, proto, inv);
}
@@ -614,8 +622,6 @@ static int nft_match_large_dump(struct sk_buff *skb,
static int nft_match_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
- struct xt_match *match = expr->ops->data;
- unsigned int hook_mask = 0;
int ret;
if (ctx->family != NFPROTO_IPV4 &&
@@ -638,11 +644,30 @@ static int nft_match_validate(const struct nft_ctx *ctx,
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
const struct nf_hook_ops *ops = &basechain->ops;
+ unsigned int hook_mask = 1 << ops->hooknum;
+ struct xt_match *match = expr->ops->data;
+ size_t size = XT_ALIGN(match->matchsize);
+ struct xt_mtchk_param par;
+ union nft_entry e = {};
+ void *info;
- hook_mask = 1 << ops->hooknum;
if (match->hooks && !(hook_mask & match->hooks))
return -EINVAL;
+ if (NFT_EXPR_SIZE(size) > NFT_MATCH_LARGE_THRESH) {
+ struct nft_xt_match_priv *priv = nft_expr_priv(expr);
+
+ info = priv->info;
+ } else {
+ info = nft_expr_priv(expr);
+ }
+
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, 0, false);
+
+ ret = xt_check_hooks_match(&par);
+ if (ret < 0)
+ return ret;
+
ret = nft_compat_chain_validate_dependency(ctx, match->table);
if (ret < 0)
return ret;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 0407d6f708ae..e6a07c0df207 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -376,7 +376,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
const struct sctp_chunkhdr *sch;
struct sctp_chunkhdr _sch;
- if (pkt->tprot != IPPROTO_SCTP)
+ if (pkt->tprot != IPPROTO_SCTP || pkt->fragoff)
goto err;
do {
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 4bce36c3a6a0..b9e88d7cf308 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -95,12 +95,15 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
void *addr = &regs->data[priv->sreg_addr];
int oif = regs->data[priv->sreg_dev];
unsigned int verdict = NF_STOLEN;
struct sk_buff *skb = pkt->skb;
+ int nhoff = skb_network_offset(skb);
struct net_device *dev;
+ unsigned int hh_len;
int neigh_table;
switch (priv->nfproto) {
@@ -111,7 +114,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
verdict = NFT_BREAK;
goto out;
}
- if (skb_try_make_writable(skb, sizeof(*iph))) {
+ if (skb_ensure_writable(skb, nhoff + sizeof(*iph))) {
verdict = NF_DROP;
goto out;
}
@@ -132,7 +135,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
verdict = NFT_BREAK;
goto out;
}
- if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+ if (skb_ensure_writable(skb, nhoff + sizeof(*ip6h))) {
verdict = NF_DROP;
goto out;
}
@@ -151,13 +154,31 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
goto out;
}
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) {
+ verdict = NF_DROP;
+ goto out;
+ }
+
dev = dev_get_by_index_rcu(nft_net(pkt), oif);
- if (dev == NULL)
- return;
+ if (dev == NULL) {
+ verdict = NF_DROP;
+ goto out;
+ }
+
+ hh_len = LL_RESERVED_SPACE(dev);
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb) {
+ verdict = NF_STOLEN;
+ goto out;
+ }
+ }
skb->dev = dev;
skb_clear_tstamp(skb);
+ (*nf_dup_skb_recursion)++;
neigh_xmit(neigh_table, dev, addr, skb);
+ (*nf_dup_skb_recursion)--;
out:
regs->verdict.code = verdict;
}
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index c02d5cb52143..45fe56da5044 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -33,7 +33,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- if (pkt->tprot != IPPROTO_TCP) {
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index f2101af8c867..89be443734f6 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -30,8 +30,8 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
__be16 tport = 0;
struct sock *sk;
- if (pkt->tprot != IPPROTO_TCP &&
- pkt->tprot != IPPROTO_UDP) {
+ if ((pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -97,8 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
memset(&taddr, 0, sizeof(taddr));
- if (pkt->tprot != IPPROTO_TCP &&
- pkt->tprot != IPPROTO_UDP) {
+ if ((pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) || pkt->fragoff) {
regs->verdict.code = NFT_BREAK;
return;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 9f837fb5ceb4..2c67c2e6b132 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -477,11 +477,9 @@ int xt_check_proc_name(const char *name, unsigned int size)
}
EXPORT_SYMBOL(xt_check_proc_name);
-int xt_check_match(struct xt_mtchk_param *par,
- unsigned int size, u16 proto, bool inv_proto)
+static int xt_check_match_common(struct xt_mtchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
{
- int ret;
-
if (XT_ALIGN(par->match->matchsize) != size &&
par->match->matchsize != -1) {
/*
@@ -530,6 +528,14 @@ int xt_check_match(struct xt_mtchk_param *par,
par->match->proto);
return -EINVAL;
}
+
+ return 0;
+}
+
+static int xt_checkentry_match(struct xt_mtchk_param *par)
+{
+ int ret;
+
if (par->match->checkentry != NULL) {
ret = par->match->checkentry(par);
if (ret < 0)
@@ -538,8 +544,34 @@ int xt_check_match(struct xt_mtchk_param *par,
/* Flag up potential errors. */
return -EIO;
}
+
+ return 0;
+}
+
+int xt_check_hooks_match(struct xt_mtchk_param *par)
+{
+ if (par->match->check_hooks != NULL)
+ return par->match->check_hooks(par);
+
return 0;
}
+EXPORT_SYMBOL_GPL(xt_check_hooks_match);
+
+int xt_check_match(struct xt_mtchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
+{
+ int ret;
+
+ ret = xt_check_match_common(par, size, proto, inv_proto);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_check_hooks_match(par);
+ if (ret < 0)
+ return ret;
+
+ return xt_checkentry_match(par);
+}
EXPORT_SYMBOL_GPL(xt_check_match);
/** xt_check_entry_match - check that matches end before start of target
@@ -1012,11 +1044,9 @@ bool xt_find_jump_offset(const unsigned int *offsets,
}
EXPORT_SYMBOL(xt_find_jump_offset);
-int xt_check_target(struct xt_tgchk_param *par,
- unsigned int size, u16 proto, bool inv_proto)
+static int xt_check_target_common(struct xt_tgchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
{
- int ret;
-
if (XT_ALIGN(par->target->targetsize) != size) {
pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n",
xt_prefix[par->family], par->target->name,
@@ -1061,6 +1091,23 @@ int xt_check_target(struct xt_tgchk_param *par,
par->target->proto);
return -EINVAL;
}
+
+ return 0;
+}
+
+int xt_check_hooks_target(struct xt_tgchk_param *par)
+{
+ if (par->target->check_hooks != NULL)
+ return par->target->check_hooks(par);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_hooks_target);
+
+static int xt_checkentry_target(struct xt_tgchk_param *par)
+{
+ int ret;
+
if (par->target->checkentry != NULL) {
ret = par->target->checkentry(par);
if (ret < 0)
@@ -1071,6 +1118,22 @@ int xt_check_target(struct xt_tgchk_param *par,
}
return 0;
}
+
+int xt_check_target(struct xt_tgchk_param *par,
+ unsigned int size, u16 proto, bool inv_proto)
+{
+ int ret;
+
+ ret = xt_check_target_common(par, size, proto, inv_proto);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_check_hooks_target(par);
+ if (ret < 0)
+ return ret;
+
+ return xt_checkentry_target(par);
+}
EXPORT_SYMBOL_GPL(xt_check_target);
/**
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 498f5871c84a..d2aeacf94230 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -354,7 +354,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -366,7 +366,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -398,7 +398,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v1,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
@@ -410,7 +410,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
- .usersize = offsetof(struct xt_ct_target_info, ct),
+ .usersize = offsetof(struct xt_ct_target_info_v1, ct),
.checkentry = xt_ct_tg_check_v2,
.destroy = xt_ct_tg_destroy_v1,
.target = xt_ct_target_v1,
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 116a885adb3c..80e1634bc51f 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
}
#endif
+static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par)
+{
+ const struct xt_tcpmss_info *info = par->targinfo;
+
+ if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
+ (par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING))) != 0) {
+ pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Must specify -p tcp --syn */
static inline bool find_syn_match(const struct xt_entry_match *m)
{
@@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m)
static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
{
- const struct xt_tcpmss_info *info = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
const struct xt_entry_match *ematch;
- if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
- (par->hook_mask & ~((1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING))) != 0) {
- pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
- return -EINVAL;
- }
if (par->nft_compat)
return 0;
@@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
{
- const struct xt_tcpmss_info *info = par->targinfo;
const struct ip6t_entry *e = par->entryinfo;
const struct xt_entry_match *ematch;
- if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
- (par->hook_mask & ~((1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING))) != 0) {
- pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
- return -EINVAL;
- }
if (par->nft_compat)
return 0;
@@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
{
.family = NFPROTO_IPV4,
.name = "TCPMSS",
+ .check_hooks = tcpmss_tg4_check_hooks,
.checkentry = tcpmss_tg4_check,
.target = tcpmss_tg4,
.targetsize = sizeof(struct xt_tcpmss_info),
@@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = {
{
.family = NFPROTO_IPV6,
.name = "TCPMSS",
+ .check_hooks = tcpmss_tg4_check_hooks,
.checkentry = tcpmss_tg6_check,
.target = tcpmss_tg6,
.targetsize = sizeof(struct xt_tcpmss_info),
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e4bea1d346cf..5f60e7298a1e 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -86,6 +86,9 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info *tgi = par->targinfo;
+ if (par->fragoff)
+ return NF_DROP;
+
return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
tgi->mark_mask, tgi->mark_value);
}
@@ -95,6 +98,9 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ if (par->fragoff)
+ return NF_DROP;
+
return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
tgi->mark_mask, tgi->mark_value);
}
@@ -106,6 +112,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ unsigned short fragoff = 0;
struct udphdr _hdr, *hp;
struct sock *sk;
const struct in6_addr *laddr;
@@ -113,8 +120,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int thoff = 0;
int tproto;
- tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0)
+ tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (tproto < 0 || fragoff)
return NF_DROP;
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index a77088943107..913dbe3aa5e2 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par)
{
- const char *errmsg = "both incoming and outgoing interface limitation cannot be selected";
struct xt_addrtype_info_v1 *info = par->matchinfo;
-
- if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
- info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
- goto err;
+ const char *errmsg;
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) &&
@@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
goto err;
}
+ return 0;
+err:
+ pr_info_ratelimited("%s\n", errmsg);
+ return -EINVAL;
+}
+
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+ const char *errmsg = "both incoming and outgoing interface limitation cannot be selected";
+ struct xt_addrtype_info_v1 *info = par->matchinfo;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
+ goto err;
+
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
if (par->family == NFPROTO_IPV6) {
if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) {
@@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.revision = 1,
.match = addrtype_mt_v1,
+ .check_hooks = addrtype_mt_check_hooks,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
@@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.revision = 1,
.match = addrtype_mt_v1,
+ .check_hooks = addrtype_mt_check_hooks,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index 9520dd00070b..6d1a44ab5eee 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
return true;
}
-static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_devgroup_info *info = par->matchinfo;
- if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
- XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
- return -EINVAL;
-
if (info->flags & XT_DEVGROUP_MATCH_SRC &&
par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
@@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
return 0;
}
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+ return -EINVAL;
+
+ return 0;
+}
+
static struct xt_match devgroup_mt_reg __read_mostly = {
.name = "devgroup",
.match = devgroup_mt,
+ .check_hooks = devgroup_mt_check_hooks,
.checkentry = devgroup_mt_checkentry,
.matchsize = sizeof(struct xt_devgroup_info),
.family = NFPROTO_UNSPEC,
diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c
index b96e8203ac54..a8503f5d26bf 100644
--- a/net/netfilter/xt_ecn.c
+++ b/net/netfilter/xt_ecn.c
@@ -30,6 +30,10 @@ static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par)
struct tcphdr _tcph;
const struct tcphdr *th;
+ /* this is fine for IPv6 as ecn_mt_check6() enforces -p tcp */
+ if (par->fragoff)
+ return false;
+
/* In practice, TCP match does this, so can't fail. But let's
* be good citizens.
*/
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 3bd127bfc114..2704b4b60d1e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -658,6 +658,8 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
if (!(hinfo->cfg.mode &
(XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
return 0;
+ if (ntohs(ip_hdr(skb)->frag_off) & IP_OFFSET)
+ return -1;
nexthdr = ip_hdr(skb)->protocol;
break;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
@@ -681,7 +683,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
nexthdr = ipv6_hdr(skb)->nexthdr;
protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
- if ((int)protoff < 0)
+ if ((int)protoff < 0 || ntohs(frag_off) & IP6_OFFSET)
return -1;
break;
}
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index dc9485854002..e8807caede68 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -27,6 +27,9 @@
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
+ if (p->fragoff)
+ return false;
+
return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers);
}
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index d2b0b52434fa..dd98f758176c 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -91,14 +91,10 @@ match_outdev:
return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT));
}
-static int physdev_mt_check(const struct xt_mtchk_param *par)
+static int physdev_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_physdev_info *info = par->matchinfo;
- static bool brnf_probed __read_mostly;
- if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
- info->bitmask & ~XT_PHYSDEV_OP_MASK)
- return -EINVAL;
if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) &&
(!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
@@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ return 0;
+}
+
+static int physdev_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_physdev_info *info = par->matchinfo;
+ static bool brnf_probed __read_mostly;
+
+ if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
+ info->bitmask & ~XT_PHYSDEV_OP_MASK)
+ return -EINVAL;
+
#define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb)
if (info->bitmask & XT_PHYSDEV_OP_IN) {
if (info->physindev[0] == '\0')
@@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
{
.name = "physdev",
.family = NFPROTO_IPV4,
+ .check_hooks = physdev_mt_check_hooks,
.checkentry = physdev_mt_check,
.match = physdev_mt,
.matchsize = sizeof(struct xt_physdev_info),
@@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = {
{
.name = "physdev",
.family = NFPROTO_IPV6,
+ .check_hooks = physdev_mt_check_hooks,
.checkentry = physdev_mt_check,
.match = physdev_mt,
.matchsize = sizeof(struct xt_physdev_info),
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index b5fa65558318..ff54e3a8581e 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret;
}
-static int policy_mt_check(const struct xt_mtchk_param *par)
+static int policy_mt_check_hooks(const struct xt_mtchk_param *par)
{
const struct xt_policy_info *info = par->matchinfo;
- const char *errmsg = "neither incoming nor outgoing policy selected";
-
- if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT)))
- goto err;
+ const char *errmsg;
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) {
@@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par)
errmsg = "input policy not valid in POSTROUTING and OUTPUT";
goto err;
}
+
+ return 0;
+err:
+ pr_info_ratelimited("%s\n", errmsg);
+ return -EINVAL;
+}
+
+static int policy_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_policy_info *info = par->matchinfo;
+ const char *errmsg = "neither incoming nor outgoing policy selected";
+
+ if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT)))
+ goto err;
+
if (info->len > XT_POLICY_MAX_ELEM) {
errmsg = "too many policy elements";
goto err;
@@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
{
.name = "policy",
.family = NFPROTO_IPV4,
+ .check_hooks = policy_mt_check_hooks,
.checkentry = policy_mt_check,
.match = policy_mt,
.matchsize = sizeof(struct xt_policy_info),
@@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = {
{
.name = "policy",
.family = NFPROTO_IPV6,
+ .check_hooks = policy_mt_check_hooks,
.checkentry = policy_mt_check,
.match = policy_mt,
.matchsize = sizeof(struct xt_policy_info),
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 731bc2cafae4..4ae04bba9358 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -431,6 +431,29 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
}
static int
+set_target_v3_check_hooks(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target_v3 *info = par->targinfo;
+
+ if (info->map_set.index != IPSET_INVALID_ID) {
+ if (strncmp(par->table, "mangle", 7)) {
+ pr_info_ratelimited("--map-set only usable from mangle table\n");
+ return -EINVAL;
+ }
+ if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
+ (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
+ (par->hook_mask & ~(1 << NF_INET_FORWARD |
+ 1 << NF_INET_LOCAL_OUT |
+ 1 << NF_INET_POST_ROUTING))) {
+ pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int
set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
@@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
}
if (info->map_set.index != IPSET_INVALID_ID) {
- if (strncmp(par->table, "mangle", 7)) {
- pr_info_ratelimited("--map-set only usable from mangle table\n");
- ret = -EINVAL;
- goto cleanup_del;
- }
- if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
- (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
- (par->hook_mask & ~(1 << NF_INET_FORWARD |
- 1 << NF_INET_LOCAL_OUT |
- 1 << NF_INET_POST_ROUTING))) {
- pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
- ret = -EINVAL;
- goto cleanup_del;
- }
index = ip_set_nfnl_get_byindex(par->net,
info->map_set.index);
if (index == IPSET_INVALID_ID) {
@@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = {
.family = NFPROTO_IPV4,
.target = set_target_v3,
.targetsize = sizeof(struct xt_set_info_target_v3),
+ .check_hooks = set_target_v3_check_hooks,
.checkentry = set_target_v3_checkentry,
.destroy = set_target_v3_destroy,
.me = THIS_MODULE
@@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = {
.family = NFPROTO_IPV6,
.target = set_target_v3,
.targetsize = sizeof(struct xt_set_info_target_v3),
+ .check_hooks = set_target_v3_check_hooks,
.checkentry = set_target_v3_checkentry,
.destroy = set_target_v3_destroy,
.me = THIS_MODULE
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 0d32d4841cb3..b9da8269161d 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -32,6 +32,10 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
u8 _opt[15 * 4 - sizeof(_tcph)];
unsigned int i, optlen;
+ /* this is fine for IPv6 as xt_tcpmss enforces -p tcp */
+ if (par->fragoff)
+ return false;
+
/* If we don't have the whole header, drop packet. */
th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
if (th == NULL)
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index b10e1602c6b1..cb5ea4424ffc 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -97,6 +97,9 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
goto error;
}
+ vport->dev = dev;
+ netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
rtnl_unlock();
return vport;
error:
@@ -111,7 +114,7 @@ static struct vport *geneve_create(const struct vport_parms *parms)
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ return ovs_netdev_link(vport, true);
}
static struct vport_ops ovs_geneve_vport_ops = {
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 4014c9b5eb79..6cb5a697b396 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -63,6 +63,9 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms)
return ERR_PTR(err);
}
+ vport->dev = dev;
+ netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
rtnl_unlock();
return vport;
}
@@ -75,7 +78,7 @@ static struct vport *gre_create(const struct vport_parms *parms)
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ return ovs_netdev_link(vport, true);
}
static struct vport_ops ovs_gre_vport_ops = {
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 12055af832dc..c42642075685 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -73,37 +73,21 @@ static struct net_device *get_dpdev(const struct datapath *dp)
return local->dev;
}
-struct vport *ovs_netdev_link(struct vport *vport, const char *name)
+struct vport *ovs_netdev_link(struct vport *vport, bool tunnel)
{
int err;
- vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
- if (!vport->dev) {
+ if (WARN_ON_ONCE(!vport->dev)) {
err = -ENODEV;
goto error_free_vport;
}
- /* Ensure that the device exists and that the provided
- * name is not one of its aliases.
- */
- if (strcmp(name, ovs_vport_name(vport))) {
- err = -ENODEV;
- goto error_put;
- }
- netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL);
- if (vport->dev->flags & IFF_LOOPBACK ||
- (vport->dev->type != ARPHRD_ETHER &&
- vport->dev->type != ARPHRD_NONE) ||
- ovs_is_internal_dev(vport->dev)) {
- err = -EINVAL;
- goto error_put;
- }
rtnl_lock();
err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp),
NULL, NULL, NULL);
if (err)
- goto error_unlock;
+ goto error_put_unlock;
err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
@@ -119,10 +103,11 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
error_master_upper_dev_unlink:
netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
-error_unlock:
- rtnl_unlock();
-error_put:
+error_put_unlock:
+ if (tunnel && vport->dev->reg_state == NETREG_REGISTERED)
+ rtnl_delete_link(vport->dev, 0, NULL);
netdev_put(vport->dev, &vport->dev_tracker);
+ rtnl_unlock();
error_free_vport:
ovs_vport_free(vport);
return ERR_PTR(err);
@@ -132,12 +117,39 @@ EXPORT_SYMBOL_GPL(ovs_netdev_link);
static struct vport *netdev_create(const struct vport_parms *parms)
{
struct vport *vport;
+ int err;
vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
+ if (!vport->dev) {
+ err = -ENODEV;
+ goto error_free_vport;
+ }
+ netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
+ /* Ensure that the provided name is not an alias. */
+ if (strcmp(parms->name, ovs_vport_name(vport))) {
+ err = -ENODEV;
+ goto error_put;
+ }
+
+ if (vport->dev->flags & IFF_LOOPBACK ||
+ (vport->dev->type != ARPHRD_ETHER &&
+ vport->dev->type != ARPHRD_NONE) ||
+ ovs_is_internal_dev(vport->dev)) {
+ err = -EINVAL;
+ goto error_put;
+ }
+
+ return ovs_netdev_link(vport, false);
+error_put:
+ netdev_put(vport->dev, &vport->dev_tracker);
+error_free_vport:
+ ovs_vport_free(vport);
+ return ERR_PTR(err);
}
static void vport_netdev_free(struct rcu_head *rcu)
@@ -196,9 +208,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport)
*/
if (vport->dev->reg_state == NETREG_REGISTERED)
rtnl_delete_link(vport->dev, 0, NULL);
- rtnl_unlock();
+ /* We can't put the device reference yet, since it can still be in
+ * use, but rtnl_unlock()->netdev_run_todo() will block until all
+ * the references are released, so the RCU call must be before it.
+ */
call_rcu(&vport->rcu, vport_netdev_free);
+ rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
index c5d83a43bfc4..6c0d7366f986 100644
--- a/net/openvswitch/vport-netdev.h
+++ b/net/openvswitch/vport-netdev.h
@@ -13,7 +13,7 @@
struct vport *ovs_netdev_get_vport(struct net_device *dev);
-struct vport *ovs_netdev_link(struct vport *vport, const char *name);
+struct vport *ovs_netdev_link(struct vport *vport, bool tunnel);
void ovs_netdev_detach_dev(struct vport *);
int __init ovs_netdev_init(void);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 0b881b043bcf..c1b37b50d29e 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -126,6 +126,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
goto error;
}
+ vport->dev = dev;
+ netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL);
+
rtnl_unlock();
return vport;
error:
@@ -140,7 +143,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms)
if (IS_ERR(vport))
return vport;
- return ovs_netdev_link(vport, parms->name);
+ return ovs_netdev_link(vport, true);
}
static struct vport_ops ovs_vxlan_netdev_vport_ops = {
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
index 9508b6c38003..e45549f08eef 100644
--- a/net/psp/psp_main.c
+++ b/net/psp/psp_main.c
@@ -263,15 +263,16 @@ EXPORT_SYMBOL(psp_dev_encapsulate);
/* Receive handler for PSP packets.
*
- * Presently it accepts only already-authenticated packets and does not
- * support optional fields, such as virtualization cookies. The caller should
- * ensure that skb->data is pointing to the mac header, and that skb->mac_len
- * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE
- * is not supported).
+ * Accepts only already-authenticated packets. The full PSP header is
+ * stripped according to psph->hdrlen; any optional fields it advertises
+ * (virtualization cookies, etc.) are ignored and discarded along with the
+ * rest of the header. The caller should ensure that skb->data is pointing
+ * to the mac header, and that skb->mac_len is set. This function does not
+ * currently adjust skb->csum (CHECKSUM_COMPLETE is not supported).
*/
int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
{
- int l2_hlen = 0, l3_hlen, encap;
+ int l2_hlen = 0, l3_hlen, encap, psp_hlen;
struct psp_skb_ext *pse;
struct psphdr *psph;
struct ethhdr *eth;
@@ -312,18 +313,36 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT)))
return -EINVAL;
- pse = skb_ext_add(skb, SKB_EXT_PSP);
- if (!pse)
+ psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen +
+ sizeof(struct udphdr));
+
+ /* Strip the full PSP header per psph->hdrlen; VC/options are pulled
+ * into the linear region only so they can be discarded with the
+ * rest of the header.
+ */
+ psp_hlen = (psph->hdrlen + 1) * 8;
+
+ if (unlikely(psp_hlen < sizeof(struct psphdr)))
+ return -EINVAL;
+
+ if (psp_hlen > sizeof(struct psphdr) &&
+ !pskb_may_pull(skb, l2_hlen + l3_hlen +
+ sizeof(struct udphdr) + psp_hlen))
return -EINVAL;
psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen +
sizeof(struct udphdr));
+
+ pse = skb_ext_add(skb, SKB_EXT_PSP);
+ if (!pse)
+ return -EINVAL;
+
pse->spi = psph->spi;
pse->dev_id = dev_id;
pse->generation = generation;
pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl);
- encap = PSP_ENCAP_HLEN;
+ encap = sizeof(struct udphdr) + psp_hlen;
encap += strip_icv ? PSP_TRL_SIZE : 0;
if (proto == htons(ETH_P_IP)) {
@@ -340,8 +359,9 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap);
}
- memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen);
- skb_pull(skb, PSP_ENCAP_HLEN);
+ memmove(skb->data + sizeof(struct udphdr) + psp_hlen,
+ skb->data, l2_hlen + l3_hlen);
+ skb_pull(skb, sizeof(struct udphdr) + psp_hlen);
if (strip_icv)
pskb_trim(skb, skb->len - PSP_TRL_SIZE);
diff --git a/net/rds/message.c b/net/rds/message.c
index eaa6f22601a4..25fedcb3cd00 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -131,24 +131,34 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
*/
static void rds_message_purge(struct rds_message *rm)
{
+ struct rds_znotifier *znotifier;
unsigned long i, flags;
- bool zcopy = false;
+ bool zcopy;
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
spin_lock_irqsave(&rm->m_rs_lock, flags);
+ znotifier = rm->data.op_mmp_znotifier;
+ rm->data.op_mmp_znotifier = NULL;
+ zcopy = !!znotifier;
+
if (rm->m_rs) {
struct rds_sock *rs = rm->m_rs;
- if (rm->data.op_mmp_znotifier) {
- zcopy = true;
- rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
+ if (znotifier) {
+ rds_rm_zerocopy_callback(rs, znotifier);
rds_wake_sk_sleep(rs);
- rm->data.op_mmp_znotifier = NULL;
}
sock_put(rds_rs_to_sk(rs));
rm->m_rs = NULL;
+ } else if (znotifier) {
+ /*
+ * Zerocopy can fail before the message is queued on the
+ * socket, so there is no rs to carry the notification.
+ */
+ mm_unaccount_pinned_pages(&znotifier->z_mmp);
+ kfree(rds_info_from_znotifier(znotifier));
}
spin_unlock_irqrestore(&rm->m_rs_lock, flags);
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 13c6d1869a14..5862933be8d7 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -399,14 +399,14 @@ static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust);
* Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
*/
-static void cobalt_newton_step(struct cobalt_vars *vars)
+static void cobalt_newton_step(struct cobalt_vars *vars, u32 count)
{
u32 invsqrt, invsqrt2;
u64 val;
invsqrt = vars->rec_inv_sqrt;
invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
- val = (3LL << 32) - ((u64)vars->count * invsqrt2);
+ val = (3LL << 32) - ((u64)count * invsqrt2);
val >>= 2; /* avoid overflow in following multiply */
val = (val * invsqrt) >> (32 - 2 + 1);
@@ -414,12 +414,12 @@ static void cobalt_newton_step(struct cobalt_vars *vars)
vars->rec_inv_sqrt = val;
}
-static void cobalt_invsqrt(struct cobalt_vars *vars)
+static void cobalt_invsqrt(struct cobalt_vars *vars, u32 count)
{
- if (vars->count < REC_INV_SQRT_CACHE)
- vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
+ if (count < REC_INV_SQRT_CACHE)
+ vars->rec_inv_sqrt = inv_sqrt_cache[count];
else
- cobalt_newton_step(vars);
+ cobalt_newton_step(vars, count);
}
static void cobalt_vars_init(struct cobalt_vars *vars)
@@ -449,16 +449,19 @@ static bool cobalt_queue_full(struct cobalt_vars *vars,
bool up = false;
if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
- up = !vars->p_drop;
- vars->p_drop += p->p_inc;
- if (vars->p_drop < p->p_inc)
- vars->p_drop = ~0;
- vars->blue_timer = now;
- }
- vars->dropping = true;
- vars->drop_next = now;
+ u32 p_drop = vars->p_drop;
+
+ up = !p_drop;
+ p_drop += p->p_inc;
+ if (p_drop < p->p_inc)
+ p_drop = ~0;
+ WRITE_ONCE(vars->p_drop, p_drop);
+ WRITE_ONCE(vars->blue_timer, now);
+ }
+ WRITE_ONCE(vars->dropping, true);
+ WRITE_ONCE(vars->drop_next, now);
if (!vars->count)
- vars->count = 1;
+ WRITE_ONCE(vars->count, 1);
return up;
}
@@ -475,20 +478,20 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars,
if (vars->p_drop &&
ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
if (vars->p_drop < p->p_dec)
- vars->p_drop = 0;
+ WRITE_ONCE(vars->p_drop, 0);
else
- vars->p_drop -= p->p_dec;
- vars->blue_timer = now;
+ WRITE_ONCE(vars->p_drop, vars->p_drop - p->p_dec);
+ WRITE_ONCE(vars->blue_timer, now);
down = !vars->p_drop;
}
- vars->dropping = false;
+ WRITE_ONCE(vars->dropping, false);
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
- vars->count--;
- cobalt_invsqrt(vars);
- vars->drop_next = cobalt_control(vars->drop_next,
- p->interval,
- vars->rec_inv_sqrt);
+ WRITE_ONCE(vars->count, vars->count - 1);
+ cobalt_invsqrt(vars, vars->count);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(vars->drop_next, p->interval,
+ vars->rec_inv_sqrt));
}
return down;
@@ -507,6 +510,7 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
bool next_due, over_target;
ktime_t schedule;
u64 sojourn;
+ u32 count;
/* The 'schedule' variable records, in its sign, whether 'now' is before or
* after 'drop_next'. This allows 'drop_next' to be updated before the next
@@ -528,21 +532,22 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
over_target = sojourn > p->target &&
sojourn > p->mtu_time * bulk_flows * 2 &&
sojourn > p->mtu_time * 4;
- next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ count = vars->count;
+ next_due = count && ktime_to_ns(schedule) >= 0;
vars->ecn_marked = false;
if (over_target) {
if (!vars->dropping) {
- vars->dropping = true;
- vars->drop_next = cobalt_control(now,
- p->interval,
- vars->rec_inv_sqrt);
+ WRITE_ONCE(vars->dropping, true);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(now, p->interval,
+ vars->rec_inv_sqrt));
}
- if (!vars->count)
- vars->count = 1;
+ if (!count)
+ count = 1;
} else if (vars->dropping) {
- vars->dropping = false;
+ WRITE_ONCE(vars->dropping, false);
}
if (next_due && vars->dropping) {
@@ -550,23 +555,23 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
reason = QDISC_DROP_CONGESTED;
- vars->count++;
- if (!vars->count)
- vars->count--;
- cobalt_invsqrt(vars);
- vars->drop_next = cobalt_control(vars->drop_next,
- p->interval,
- vars->rec_inv_sqrt);
+ count++;
+ if (!count)
+ count--;
+ cobalt_invsqrt(vars, count);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(vars->drop_next, p->interval,
+ vars->rec_inv_sqrt));
schedule = ktime_sub(now, vars->drop_next);
} else {
while (next_due) {
- vars->count--;
- cobalt_invsqrt(vars);
- vars->drop_next = cobalt_control(vars->drop_next,
- p->interval,
- vars->rec_inv_sqrt);
+ count--;
+ cobalt_invsqrt(vars, count);
+ WRITE_ONCE(vars->drop_next,
+ cobalt_control(vars->drop_next, p->interval,
+ vars->rec_inv_sqrt));
schedule = ktime_sub(now, vars->drop_next);
- next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ next_due = count && ktime_to_ns(schedule) >= 0;
}
}
@@ -575,11 +580,12 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
get_random_u32() < vars->p_drop)
reason = QDISC_DROP_FLOOD_PROTECTION;
+ WRITE_ONCE(vars->count, count);
/* Overload the drop_next field as an activity timeout */
- if (!vars->count)
- vars->drop_next = ktime_add_ns(now, p->interval);
+ if (!count)
+ WRITE_ONCE(vars->drop_next, ktime_add_ns(now, p->interval));
else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC)
- vars->drop_next = now;
+ WRITE_ONCE(vars->drop_next, now);
return reason;
}
@@ -914,7 +920,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
struct sk_buff *skb = flow->head;
if (skb) {
- flow->head = skb->next;
+ WRITE_ONCE(flow->head, skb->next);
skb_mark_not_on_list(skb);
}
@@ -926,7 +932,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
{
if (!flow->head)
- flow->head = skb;
+ WRITE_ONCE(flow->head, skb);
else
flow->tail->next = skb;
flow->tail = skb;
@@ -1357,7 +1363,7 @@ found:
if (elig_ack_prev)
elig_ack_prev->next = elig_ack->next;
else
- flow->head = elig_ack->next;
+ WRITE_ONCE(flow->head, elig_ack->next);
skb_mark_not_on_list(elig_ack);
@@ -1595,11 +1601,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
len = qdisc_pkt_len(skb);
q->buffer_used -= skb->truesize;
- b->backlogs[idx] -= len;
WRITE_ONCE(b->tin_backlog, b->tin_backlog - len);
+ WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] - len);
sch->qstats.backlog -= len;
- flow->dropped++;
+ WRITE_ONCE(flow->dropped, flow->dropped + 1);
WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1);
if (q->config->rate_flags & CAKE_FLAG_INGRESS)
@@ -1824,11 +1830,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
/* stats */
- b->backlogs[idx] += slen;
sch->qstats.backlog += slen;
q->avg_window_bytes += slen;
WRITE_ONCE(b->bytes, b->bytes + slen);
WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen);
+ WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + slen);
qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
consume_skb(skb);
@@ -1861,11 +1867,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* stats */
WRITE_ONCE(b->packets, b->packets + 1);
- b->backlogs[idx] += len - ack_pkt_len;
sch->qstats.backlog += len - ack_pkt_len;
q->avg_window_bytes += len - ack_pkt_len;
WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len);
WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len);
+ WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + len - ack_pkt_len);
}
if (q->overflow_timeout)
@@ -1924,7 +1930,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
flow->set = CAKE_SET_SPARSE;
WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1);
- flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
+ WRITE_ONCE(flow->deficit, cake_get_flow_quantum(b, flow, q->config->flow_mode));
} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
/* this flow was empty, accounted as a sparse flow, but actually
* in the bulk rotation.
@@ -1977,7 +1983,7 @@ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
if (flow->head) {
skb = dequeue_head(flow);
len = qdisc_pkt_len(skb);
- b->backlogs[q->cur_flow] -= len;
+ WRITE_ONCE(b->backlogs[q->cur_flow], b->backlogs[q->cur_flow] - len);
WRITE_ONCE(b->tin_backlog, b->tin_backlog - len);
sch->qstats.backlog -= len;
q->buffer_used -= skb->truesize;
@@ -2166,7 +2172,8 @@ retry:
}
}
- flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
+ WRITE_ONCE(flow->deficit,
+ flow->deficit + cake_get_flow_quantum(b, flow, q->config->flow_mode));
list_move_tail(&flow->flowchain, &b->old_flows);
goto retry;
@@ -2232,10 +2239,10 @@ retry:
if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
len = cake_advance_shaper(q, b, skb,
now, true);
- flow->deficit -= len;
+ WRITE_ONCE(flow->deficit, flow->deficit - len);
b->tin_deficit -= len;
}
- flow->dropped++;
+ WRITE_ONCE(flow->dropped, flow->dropped + 1);
WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1);
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_qstats_drop(sch);
@@ -2259,7 +2266,7 @@ retry:
delay < b->base_delay ? 2 : 8));
len = cake_advance_shaper(q, b, skb, now, false);
- flow->deficit -= len;
+ WRITE_ONCE(flow->deficit, flow->deficit - len);
b->tin_deficit -= len;
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
@@ -3137,7 +3144,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
flow = &b->flows[idx % CAKE_QUEUES];
- if (flow->head) {
+ if (READ_ONCE(flow->head)) {
sch_tree_lock(sch);
skb = flow->head;
while (skb) {
@@ -3146,13 +3153,15 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
}
sch_tree_unlock(sch);
}
- qs.backlog = b->backlogs[idx % CAKE_QUEUES];
- qs.drops = flow->dropped;
+ qs.backlog = READ_ONCE(b->backlogs[idx % CAKE_QUEUES]);
+ qs.drops = READ_ONCE(flow->dropped);
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
if (flow) {
ktime_t now = ktime_get();
+ bool dropping;
+ u32 p_drop;
stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
if (!stats)
@@ -3167,21 +3176,23 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
goto nla_put_failure; \
} while (0)
- PUT_STAT_S32(DEFICIT, flow->deficit);
- PUT_STAT_U32(DROPPING, flow->cvars.dropping);
- PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
- PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
- if (flow->cvars.p_drop) {
+ PUT_STAT_S32(DEFICIT, READ_ONCE(flow->deficit));
+ dropping = READ_ONCE(flow->cvars.dropping);
+ PUT_STAT_U32(DROPPING, dropping);
+ PUT_STAT_U32(COBALT_COUNT, READ_ONCE(flow->cvars.count));
+ p_drop = READ_ONCE(flow->cvars.p_drop);
+ PUT_STAT_U32(P_DROP, p_drop);
+ if (p_drop) {
PUT_STAT_S32(BLUE_TIMER_US,
ktime_to_us(
ktime_sub(now,
- flow->cvars.blue_timer)));
+ READ_ONCE(flow->cvars.blue_timer))));
}
- if (flow->cvars.dropping) {
+ if (dropping) {
PUT_STAT_S32(DROP_NEXT_US,
ktime_to_us(
ktime_sub(now,
- flow->cvars.drop_next)));
+ READ_ONCE(flow->cvars.drop_next))));
}
if (nla_nest_end(d->skb, stats) < 0)
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 0664b2f2d6f2..24db54684e8a 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -117,7 +117,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
{
struct sk_buff *skb = flow->head;
- flow->head = skb->next;
+ WRITE_ONCE(flow->head, skb->next);
skb_mark_not_on_list(skb);
return skb;
}
@@ -127,7 +127,7 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
struct sk_buff *skb)
{
if (flow->head == NULL)
- flow->head = skb;
+ WRITE_ONCE(flow->head, skb);
else
flow->tail->next = skb;
flow->tail = skb;
@@ -173,8 +173,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
} while (++i < max_packets && len < threshold);
/* Tell codel to increase its signal strength also */
- flow->cvars.count += i;
- q->backlogs[idx] -= len;
+ WRITE_ONCE(flow->cvars.count, flow->cvars.count + i);
+ WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] - len);
q->memory_usage -= mem;
sch->qstats.drops += i;
sch->qstats.backlog -= len;
@@ -204,13 +204,13 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
codel_set_enqueue_time(skb);
flow = &q->flows[idx];
flow_queue_add(flow, skb);
- q->backlogs[idx] += qdisc_pkt_len(skb);
+ WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] + qdisc_pkt_len(skb));
qdisc_qstats_backlog_inc(sch, skb);
if (list_empty(&flow->flowchain)) {
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
- flow->deficit = q->quantum;
+ WRITE_ONCE(flow->deficit, q->quantum);
}
get_codel_cb(skb)->mem_usage = skb->truesize;
q->memory_usage += get_codel_cb(skb)->mem_usage;
@@ -263,7 +263,8 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
flow = container_of(vars, struct fq_codel_flow, cvars);
if (flow->head) {
skb = dequeue_head(flow);
- q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+ WRITE_ONCE(q->backlogs[flow - q->flows],
+ q->backlogs[flow - q->flows] - qdisc_pkt_len(skb));
q->memory_usage -= get_codel_cb(skb)->mem_usage;
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
@@ -296,7 +297,7 @@ begin:
flow = list_first_entry(head, struct fq_codel_flow, flowchain);
if (flow->deficit <= 0) {
- flow->deficit += q->quantum;
+ WRITE_ONCE(flow->deficit, flow->deficit + q->quantum);
list_move_tail(&flow->flowchain, &q->old_flows);
goto begin;
}
@@ -314,7 +315,7 @@ begin:
goto begin;
}
qdisc_bstats_update(sch, skb);
- flow->deficit -= qdisc_pkt_len(skb);
+ WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb));
if (q->cstats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
@@ -328,7 +329,7 @@ begin:
static void fq_codel_flow_purge(struct fq_codel_flow *flow)
{
rtnl_kfree_skbs(flow->head, flow->tail);
- flow->head = NULL;
+ WRITE_ONCE(flow->head, NULL);
}
static void fq_codel_reset(struct Qdisc *sch)
@@ -656,21 +657,21 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
memset(&xstats, 0, sizeof(xstats));
xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
- xstats.class_stats.deficit = flow->deficit;
+ xstats.class_stats.deficit = READ_ONCE(flow->deficit);
xstats.class_stats.ldelay =
- codel_time_to_us(flow->cvars.ldelay);
- xstats.class_stats.count = flow->cvars.count;
- xstats.class_stats.lastcount = flow->cvars.lastcount;
- xstats.class_stats.dropping = flow->cvars.dropping;
- if (flow->cvars.dropping) {
- codel_tdiff_t delta = flow->cvars.drop_next -
+ codel_time_to_us(READ_ONCE(flow->cvars.ldelay));
+ xstats.class_stats.count = READ_ONCE(flow->cvars.count);
+ xstats.class_stats.lastcount = READ_ONCE(flow->cvars.lastcount);
+ xstats.class_stats.dropping = READ_ONCE(flow->cvars.dropping);
+ if (xstats.class_stats.dropping) {
+ codel_tdiff_t delta = READ_ONCE(flow->cvars.drop_next) -
codel_get_time();
xstats.class_stats.drop_next = (delta >= 0) ?
codel_time_to_us(delta) :
-codel_time_to_us(-delta);
}
- if (flow->head) {
+ if (READ_ONCE(flow->head)) {
sch_tree_lock(sch);
skb = flow->head;
while (skb) {
@@ -679,7 +680,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
}
sch_tree_unlock(sch);
}
- qs.backlog = q->backlogs[idx];
+ qs.backlog = READ_ONCE(q->backlogs[idx]);
qs.drops = 0;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index fb53fbf0e328..b41f2def2e2c 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -219,16 +219,14 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
* packet timestamp.
*/
if (!params->dq_rate_estimator) {
- vars->qdelay = now - pie_get_enqueue_time(skb);
+ WRITE_ONCE(vars->qdelay,
+ backlog ? now - pie_get_enqueue_time(skb) : 0);
if (vars->dq_tstamp != DTIME_INVALID)
dtime = now - vars->dq_tstamp;
vars->dq_tstamp = now;
- if (backlog == 0)
- vars->qdelay = 0;
-
if (dtime == 0)
return;
@@ -376,7 +374,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
- vars->prob += delta;
+ WRITE_ONCE(vars->prob, vars->prob + delta);
if (delta > 0) {
/* prevent overflow */
@@ -401,7 +399,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
if (qdelay == 0 && qdelay_old == 0 && update_prob)
/* Reduce drop probability to 98.4% */
- vars->prob -= vars->prob / 64;
+ WRITE_ONCE(vars->prob, vars->prob - vars->prob / 64);
WRITE_ONCE(vars->qdelay, qdelay);
vars->backlog_old = backlog;
@@ -501,7 +499,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
- .prob = q->vars.prob << BITS_PER_BYTE,
+ .prob = READ_ONCE(q->vars.prob) << BITS_PER_BYTE,
.delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) /
NSEC_PER_USEC,
.packets_in = READ_ONCE(q->stats.packets_in),
@@ -512,7 +510,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
};
/* avg_dq_rate is only valid if dq_rate_estimator is enabled */
- st.dq_rate_estimating = q->params.dq_rate_estimator;
+ st.dq_rate_estimating = READ_ONCE(q->params.dq_rate_estimator);
/* unscale and return dq_rate in bytes per sec */
if (st.dq_rate_estimating)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 432b8a3000a5..4d0e44a2e7c6 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -162,7 +162,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch)
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
- skb = child->dequeue(child);
+ skb = qdisc_dequeue_peeked(child);
if (skb) {
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index bd5ef561030f..d3ee8e5479b3 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -441,7 +441,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
struct Qdisc *child = q->qdisc;
struct sk_buff *skb;
- skb = child->dequeue(q->qdisc);
+ skb = qdisc_dequeue_peeked(child);
if (skb) {
qdisc_bstats_update(sch, skb);
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index c3f3181dba54..f39822babf88 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -225,7 +225,8 @@ static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
sfq_unlink(q, x, n, p);
- d = q->slots[x].qlen--;
+ d = q->slots[x].qlen;
+ WRITE_ONCE(q->slots[x].qlen, d - 1);
if (n == p && q->cur_depth == d)
q->cur_depth--;
sfq_link(q, x);
@@ -238,7 +239,8 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
sfq_unlink(q, x, n, p);
- d = ++q->slots[x].qlen;
+ d = q->slots[x].qlen + 1;
+ WRITE_ONCE(q->slots[x].qlen, d);
if (q->cur_depth < d)
q->cur_depth = d;
sfq_link(q, x);
@@ -298,7 +300,7 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
drop:
skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
len = qdisc_pkt_len(skb);
- slot->backlog -= len;
+ WRITE_ONCE(slot->backlog, slot->backlog - len);
sfq_dec(q, x);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
@@ -314,7 +316,7 @@ drop:
q->tail = NULL; /* no more active slots */
else
q->tail->next = slot->next;
- q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
goto drop;
}
@@ -364,10 +366,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
x = q->dep[0].next; /* get a free slot */
if (x >= SFQ_MAX_FLOWS)
return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS);
- q->ht[hash] = x;
+ WRITE_ONCE(q->ht[hash], x);
slot = &q->slots[x];
slot->hash = hash;
- slot->backlog = 0; /* should already be 0 anyway... */
+ WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */
red_set_vars(&slot->vars);
goto enqueue;
}
@@ -426,7 +428,7 @@ congestion_drop:
head = slot_dequeue_head(slot);
delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
sch->qstats.backlog -= delta;
- slot->backlog -= delta;
+ WRITE_ONCE(slot->backlog, slot->backlog - delta);
qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT);
slot_queue_add(slot, skb);
@@ -436,7 +438,7 @@ congestion_drop:
enqueue:
qdisc_qstats_backlog_inc(sch, skb);
- slot->backlog += qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
slot_queue_add(slot, skb);
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
@@ -452,7 +454,7 @@ enqueue:
*/
q->tail = slot;
/* We could use a bigger initial quantum for new flows */
- slot->allot = q->quantum;
+ WRITE_ONCE(slot->allot, q->quantum);
}
if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
@@ -489,7 +491,7 @@ next_slot:
slot = &q->slots[a];
if (slot->allot <= 0) {
q->tail = slot;
- slot->allot += q->quantum;
+ WRITE_ONCE(slot->allot, slot->allot + q->quantum);
goto next_slot;
}
skb = slot_dequeue_head(slot);
@@ -497,10 +499,10 @@ next_slot:
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
- slot->backlog -= qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb));
/* Is the slot empty? */
if (slot->qlen == 0) {
- q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
next_a = slot->next;
if (a == next_a) {
q->tail = NULL; /* no more active slots */
@@ -508,7 +510,7 @@ next_slot:
}
q->tail->next = next_a;
} else {
- slot->allot -= qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb));
}
return skb;
}
@@ -549,9 +551,9 @@ static void sfq_rehash(struct Qdisc *sch)
sfq_dec(q, i);
__skb_queue_tail(&list, skb);
}
- slot->backlog = 0;
+ WRITE_ONCE(slot->backlog, 0);
red_set_vars(&slot->vars);
- q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
}
q->tail = NULL;
@@ -570,7 +572,7 @@ drop:
dropped++;
continue;
}
- q->ht[hash] = x;
+ WRITE_ONCE(q->ht[hash], x);
slot = &q->slots[x];
slot->hash = hash;
}
@@ -581,7 +583,7 @@ drop:
slot->vars.qavg = red_calc_qavg(q->red_parms,
&slot->vars,
slot->backlog);
- slot->backlog += qdisc_pkt_len(skb);
+ WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
if (q->tail == NULL) { /* It is the first flow */
@@ -591,7 +593,7 @@ drop:
q->tail->next = x;
}
q->tail = slot;
- slot->allot = q->quantum;
+ WRITE_ONCE(slot->allot, q->quantum);
}
}
sch->q.qlen -= dropped;
@@ -905,16 +907,16 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct sfq_sched_data *q = qdisc_priv(sch);
- sfq_index idx = q->ht[cl - 1];
+ sfq_index idx = READ_ONCE(q->ht[cl - 1]);
struct gnet_stats_queue qs = { 0 };
struct tc_sfq_xstats xstats = { 0 };
if (idx != SFQ_EMPTY_SLOT) {
const struct sfq_slot *slot = &q->slots[idx];
- xstats.allot = slot->allot;
- qs.qlen = slot->qlen;
- qs.backlog = slot->backlog;
+ xstats.allot = READ_ONCE(slot->allot);
+ qs.qlen = READ_ONCE(slot->qlen);
+ qs.backlog = READ_ONCE(slot->backlog);
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
@@ -930,7 +932,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
return;
for (i = 0; i < q->divisor; i++) {
- if (q->ht[i] == SFQ_EMPTY_SLOT) {
+ if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) {
arg->count++;
continue;
}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 1a565095376a..185dbed7de5d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1628,12 +1628,8 @@ static void smc_connect_work(struct work_struct *work)
lock_sock(&smc->sk);
if (rc != 0 || smc->sk.sk_err) {
smc->sk.sk_state = SMC_CLOSED;
- if (rc == -EPIPE || rc == -EAGAIN)
- smc->sk.sk_err = EPIPE;
- else if (rc == -ECONNREFUSED)
- smc->sk.sk_err = ECONNREFUSED;
- else if (signal_pending(current))
- smc->sk.sk_err = -sock_intr_errno(timeo);
+ if (!smc->sk.sk_err)
+ smc->sk.sk_err = (rc == -EAGAIN) ? EPIPE : -rc;
sock_put(&smc->sk); /* passive closing */
goto out;
}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 798243eabb1f..2590e855f6a5 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2317,9 +2317,9 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
if (copied < 0)
goto splice_requeue;
- if (chunk < rxm->full_len) {
- rxm->offset += len;
- rxm->full_len -= len;
+ if (copied < rxm->full_len) {
+ rxm->offset += copied;
+ rxm->full_len -= copied;
goto splice_requeue;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e2d787ca3e74..1cbf36ea043b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3323,6 +3323,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct sk_buff *skb;
int answ = 0;
+ if (sk->sk_type != SOCK_STREAM)
+ return -EOPNOTSUPP;
+
mutex_lock(&u->iolock);
skb = skb_peek(&sk->sk_receive_queue);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a7967a345827..0783555e2526 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -607,6 +607,8 @@ static void unix_gc(struct work_struct *work)
struct sk_buff_head hitlist;
struct sk_buff *skb;
+ WRITE_ONCE(gc_in_progress, true);
+
spin_lock(&unix_gc_lock);
if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) {
@@ -649,10 +651,8 @@ void unix_schedule_gc(struct user_struct *user)
READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
return;
- if (!READ_ONCE(gc_in_progress)) {
- WRITE_ONCE(gc_in_progress, true);
+ if (!READ_ONCE(gc_in_progress))
queue_work(system_dfl_wq, &unix_gc_work);
- }
if (user && READ_ONCE(unix_graph_cyclic_sccs))
flush_work(&unix_gc_work);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 416d533f493d..9b8014516f4f 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -447,7 +447,9 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
u32 len)
{
- if (vvs->buf_used + len > vvs->buf_alloc)
+ u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0);
+
+ if (skb_overhead + vvs->buf_used + len > vvs->buf_alloc)
return false;
vvs->rx_bytes += len;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f334cdef8958..7db9cd433801 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1276,6 +1276,18 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
rtnl_unlock();
return -ENODEV;
}
+
+ /*
+ * The first invocation validated the wdev's netns against
+ * the caller via __cfg80211_wdev_from_attrs(). The wiphy
+ * may have moved netns between dumpit invocations (via
+ * NL80211_CMD_SET_WIPHY_NETNS), so re-check here.
+ */
+ if (!net_eq(wiphy_net(wiphy), sock_net(cb->skb->sk))) {
+ rtnl_unlock();
+ return -ENODEV;
+ }
+
*rdev = wiphy_to_rdev(wiphy);
*wdev = NULL;
@@ -13867,6 +13879,19 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(net))
return PTR_ERR(net);
+ /*
+ * The caller already has CAP_NET_ADMIN over the source netns
+ * (enforced by GENL_UNS_ADMIN_PERM on the genl op). Mirror the
+ * convention used by net/core/rtnetlink.c::rtnl_get_net_ns_capable()
+ * and require CAP_NET_ADMIN over the target netns as well, so that
+ * a caller that is privileged in their own user namespace cannot
+ * push a wiphy into a netns where they have no privilege.
+ */
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return -EPERM;
+ }
+
err = 0;
/* check if anything to do */
@@ -19828,6 +19853,7 @@ static const struct genl_small_ops nl80211_small_ops[] = {
.cmd = NL80211_CMD_SET_PMK,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl80211_set_pmk,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_CLEAR_SKB),
},
@@ -19835,6 +19861,7 @@ static const struct genl_small_ops nl80211_small_ops[] = {
.cmd = NL80211_CMD_DEL_PMK,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl80211_del_pmk,
+ .flags = GENL_UNS_ADMIN_PERM,
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
},
{
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index 4c8ea0583f94..d6cd0de64d1f 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
out->ftm.ftms_per_burst = 0;
if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
out->ftm.ftms_per_burst =
- nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);
+ nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);
if (capa->ftm.max_ftms_per_burst &&
(out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst ||
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 887abed25466..5e5786cd9af5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
}
-static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr)
{
- skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ struct xsk_addrs *xsk_addr;
+
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (unlikely(!xsk_addr))
+ return NULL;
+
+ xsk_addr->addrs[0] = addr;
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ return xsk_addr;
+}
+
+static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (!xsk_skb_destructor_is_addr(skb))
+ return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb));
+ if (likely(xsk_addr))
+ xsk_addr->num_descs = 1;
+ return xsk_addr;
+}
+
+static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ return 0;
+ }
+
+ if (unlikely(!__xsk_addrs_alloc(skb, addr)))
+ return -ENOMEM;
+ return 0;
}
static void xsk_inc_num_desc(struct sk_buff *skb)
@@ -685,7 +718,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
spin_lock_irqsave(&pool->cq_prod_lock, flags);
idx = xskq_get_prod(pool->cq);
- if (unlikely(num_descs > 1)) {
+ if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
for (i = 0; i < num_descs; i++) {
@@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
- u64 addr)
+static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+ u64 addr)
{
+ int err;
+
+ err = xsk_skb_destructor_set_addr(skb, addr);
+ if (unlikely(err))
+ return err;
+
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
- xsk_skb_destructor_set_addr(skb, addr);
+ return 0;
}
static void xsk_consume_skb(struct sk_buff *skb)
@@ -740,7 +779,7 @@ static void xsk_consume_skb(struct sk_buff *skb)
u32 num_descs = xsk_get_num_desc(skb);
struct xsk_addrs *xsk_addr;
- if (unlikely(num_descs > 1)) {
+ if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
}
@@ -819,28 +858,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
-
- xsk_skb_init_misc(skb, xs, desc->addr);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
- if (unlikely(err))
+ if (unlikely(err)) {
+ kfree_skb(skb);
return ERR_PTR(err);
+ }
}
} else {
struct xsk_addrs *xsk_addr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr)
- return ERR_PTR(-ENOMEM);
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- }
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
/* in case of -EOVERFLOW that could happen below,
* xsk_consume_skb() will release this node as whole skb
@@ -856,8 +886,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
addr = buffer - pool->addrs;
for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
- if (unlikely(i >= MAX_SKB_FRAGS))
+ if (unlikely(i >= MAX_SKB_FRAGS)) {
+ if (!xs->skb)
+ kfree_skb(skb);
return ERR_PTR(-EOVERFLOW);
+ }
page = pool->umem->pgs[addr >> PAGE_SHIFT];
get_page(page);
@@ -914,7 +947,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (unlikely(err))
goto free_err;
- xsk_skb_init_misc(skb, xs, desc->addr);
if (desc->options & XDP_TX_METADATA) {
err = xsk_skb_metadata(skb, buffer, desc,
xs->pool, hr);
@@ -927,19 +959,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct page *page;
u8 *vaddr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr) {
- err = -ENOMEM;
- goto free_err;
- }
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr) {
+ err = -ENOMEM;
+ goto free_err;
}
if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
@@ -964,18 +987,28 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
}
+ if (!xs->skb) {
+ err = xsk_skb_init_misc(skb, xs, desc->addr);
+ if (unlikely(err))
+ goto free_err;
+ }
xsk_inc_num_desc(skb);
return skb;
free_err:
- if (skb && !skb_shinfo(skb)->nr_frags)
+ if (skb && !xs->skb)
kfree_skb(skb);
if (err == -EOVERFLOW) {
- /* Drop the packet */
- xsk_inc_num_desc(xs->skb);
- xsk_drop_skb(xs->skb);
+ if (xs->skb) {
+ /* Drop the packet */
+ xsk_inc_num_desc(xs->skb);
+ xsk_drop_skb(xs->skb);
+ } else {
+ xsk_cq_cancel_locked(xs->pool, 1);
+ xs->tx->invalid_descs++;
+ }
xskq_cons_release(xs->tx);
} else {
/* Let application retry */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index cd7bc50872f6..d981cfdd8535 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -175,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
if (force_zc && force_copy)
return -EINVAL;
+ if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+ return -EOPNOTSUPP;
+
if (xsk_get_pool_from_qid(netdev, queue_id))
return -EBUSY;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a9652b422f51..cc35c2fcbbe0 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,7 +66,9 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *iph = ip_hdr(skb);
int ihl = iph->ihl * 4;
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol)
+ skb_set_inner_transport_header(skb,
+ skb_transport_offset(skb));
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
@@ -167,7 +169,9 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
int hdr_len;
iph = ipv6_hdr(skb);
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol)
+ skb_set_inner_transport_header(skb,
+ skb_transport_offset(skb));
hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr);
if (hdr_len < 0)
@@ -276,8 +280,10 @@ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb)
struct iphdr *top_iph;
int flags;
- skb_set_inner_network_header(skb, skb_network_offset(skb));
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ }
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
@@ -321,8 +327,10 @@ static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb)
struct ipv6hdr *top_iph;
int dsfield;
- skb_set_inner_network_header(skb, skb_network_offset(skb));
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+ }
skb_set_network_header(skb, -x->props.header_len);
skb->mac_header = skb->network_header +
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1748d374abca..686014d39429 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -818,17 +818,17 @@ int __xfrm_state_delete(struct xfrm_state *x)
spin_lock(&net->xfrm.xfrm_state_lock);
list_del(&x->km.all);
- hlist_del_rcu(&x->bydst);
- hlist_del_rcu(&x->bysrc);
- if (x->km.seq)
- hlist_del_rcu(&x->byseq);
+ hlist_del_init_rcu(&x->bydst);
+ hlist_del_init_rcu(&x->bysrc);
+ if (!hlist_unhashed(&x->byseq))
+ hlist_del_init_rcu(&x->byseq);
if (!hlist_unhashed(&x->state_cache))
hlist_del_rcu(&x->state_cache);
if (!hlist_unhashed(&x->state_cache_input))
hlist_del_rcu(&x->state_cache_input);
- if (x->id.spi)
- hlist_del_rcu(&x->byspi);
+ if (!hlist_unhashed(&x->byspi))
+ hlist_del_init_rcu(&x->byspi);
net->xfrm.state_num--;
xfrm_nat_keepalive_state_updated(x);
spin_unlock(&net->xfrm.xfrm_state_lock);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index d56450f61669..38a90e5ee3d9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -3323,6 +3323,7 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
[XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32),
[XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
[XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping),
[XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
[XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default),
};