Diffstat (limited to 'net')
145 files changed, 2161 insertions, 1455 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 2a7f1b15714a..e9326f322d7a 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -273,17 +273,6 @@ static int vlan_dev_open(struct net_device *dev)
 			goto out;
 	}
 
-	if (dev->flags & IFF_ALLMULTI) {
-		err = dev_set_allmulti(real_dev, 1);
-		if (err < 0)
-			goto del_unicast;
-	}
-	if (dev->flags & IFF_PROMISC) {
-		err = dev_set_promiscuity(real_dev, 1);
-		if (err < 0)
-			goto clear_allmulti;
-	}
-
 	ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr);
 
 	if (vlan->flags & VLAN_FLAG_GVRP)
@@ -297,12 +286,6 @@ static int vlan_dev_open(struct net_device *dev)
 	netif_carrier_on(dev);
 	return 0;
 
-clear_allmulti:
-	if (dev->flags & IFF_ALLMULTI)
-		dev_set_allmulti(real_dev, -1);
-del_unicast:
-	if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
-		dev_uc_del(real_dev, dev->dev_addr);
 out:
 	netif_carrier_off(dev);
 	return err;
@@ -315,10 +298,6 @@ static int vlan_dev_stop(struct net_device *dev)
 	dev_mc_unsync(real_dev, dev);
 	dev_uc_unsync(real_dev, dev);
 
-	if (dev->flags & IFF_ALLMULTI)
-		dev_set_allmulti(real_dev, -1);
-	if (dev->flags & IFF_PROMISC)
-		dev_set_promiscuity(real_dev, -1);
 
 	if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
 		dev_uc_del(real_dev, dev->dev_addr);
 
@@ -490,12 +469,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
 {
 	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
 
-	if (dev->flags & IFF_UP) {
-		if (change & IFF_ALLMULTI)
-			dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
-		if (change & IFF_PROMISC)
-			dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
-	}
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
 }
 
 static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
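
A note on the vlan_dev change above: dev_set_allmulti() and dev_set_promiscuity() take a signed increment and maintain a reference count on the lower device, which is what makes the unconditional propagation in vlan_dev_change_rx_flags() safe. A minimal userspace model of that counting scheme, assuming nothing beyond the +1/-1 contract (all names here are illustrative, not kernel API):

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of a lower device whose promiscuity is reference
	 * counted the way dev_set_promiscuity()/dev_set_allmulti()
	 * behave: each upper device contributes +1/-1 and the hardware
	 * flag stays set while the count is non-zero.
	 */
	struct lower_dev {
		int promisc_cnt;
	};

	static void set_promiscuity(struct lower_dev *d, int inc)
	{
		d->promisc_cnt += inc;
		printf("promisc count=%d -> %s\n", d->promisc_cnt,
		       d->promisc_cnt > 0 ? "enabled" : "disabled");
	}

	/* Called whenever an upper device's IFF_PROMISC bit flips. */
	static void change_rx_flags(struct lower_dev *d, bool promisc_now)
	{
		set_promiscuity(d, promisc_now ? 1 : -1);
	}

	int main(void)
	{
		struct lower_dev real = { 0 };

		change_rx_flags(&real, true);   /* vlan0 sets IFF_PROMISC */
		change_rx_flags(&real, true);   /* vlan1 sets IFF_PROMISC */
		change_rx_flags(&real, false);  /* vlan0 clears it: still enabled */
		change_rx_flags(&real, false);  /* vlan1 clears it: disabled */
		return 0;
	}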
diff --git a/net/9p/client.c b/net/9p/client.c
index d841d82e908f..cf73fe306219 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1547,7 +1547,8 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 	struct p9_client *clnt = fid->clnt;
 	struct p9_req_t *req;
 	int count = iov_iter_count(to);
-	int rsize, received, non_zc = 0;
+	u32 rsize, received;
+	bool non_zc = false;
 	char *dataptr;
 
 	*err = 0;
@@ -1570,7 +1571,7 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 				       0, 11, "dqd", fid->fid,
 				       offset, rsize);
 	} else {
-		non_zc = 1;
+		non_zc = true;
 		req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
 				    rsize);
 	}
@@ -1591,11 +1592,11 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 		return 0;
 	}
 	if (rsize < received) {
-		pr_err("bogus RREAD count (%d > %d)\n", received, rsize);
+		pr_err("bogus RREAD count (%u > %u)\n", received, rsize);
 		received = rsize;
 	}
 
-	p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", received);
+	p9_debug(P9_DEBUG_9P, "<<< RREAD count %u\n", received);
 
 	if (non_zc) {
 		int n = copy_to_iter(dataptr, received, to);
@@ -1622,9 +1623,9 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
 	*err = 0;
 
 	while (iov_iter_count(from)) {
-		int count = iov_iter_count(from);
-		int rsize = fid->iounit;
-		int written;
+		size_t count = iov_iter_count(from);
+		u32 rsize = fid->iounit;
+		u32 written;
 
 		if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
 			rsize = clnt->msize - P9_IOHDRSZ;
@@ -1632,7 +1633,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
 		if (count < rsize)
 			rsize = count;
 
-		p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d (/%d)\n",
+		p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %u (/%zu)\n",
 			 fid->fid, offset, rsize, count);
 
 		/* Don't bother zerocopy for small IO (< 1024) */
@@ -1658,11 +1659,11 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
 			break;
 		}
 		if (rsize < written) {
-			pr_err("bogus RWRITE count (%d > %d)\n", written, rsize);
+			pr_err("bogus RWRITE count (%u > %u)\n", written, rsize);
 			written = rsize;
 		}
 
-		p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", written);
+		p9_debug(P9_DEBUG_9P, "<<< RWRITE count %u\n", written);
 
 		p9_req_put(clnt, req);
 		iov_iter_revert(from, count - written - iov_iter_count(from));
@@ -2049,7 +2050,8 @@ EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
 
 int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
 {
-	int err, rsize, non_zc = 0;
+	int err, non_zc = 0;
+	u32 rsize;
 	struct p9_client *clnt;
 	struct p9_req_t *req;
 	char *dataptr;
@@ -2058,7 +2060,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
 
 	iov_iter_kvec(&to, ITER_DEST, &kv, 1, count);
 
-	p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n",
+	p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %u\n",
 		 fid->fid, offset, count);
 
 	clnt = fid->clnt;
@@ -2093,11 +2095,11 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
 		goto free_and_error;
 	}
 	if (rsize < count) {
-		pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize);
+		pr_err("bogus RREADDIR count (%u > %u)\n", count, rsize);
 		count = rsize;
 	}
 
-	p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
+	p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %u\n", count);
 
 	if (non_zc)
 		memmove(data, dataptr, count);
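
The 9p change above moves I/O sizes from int to u32. The reason this class of fix matters: once one operand of a comparison is unsigned, the signed operand is converted, so a negative int silently becomes a huge value. A minimal sketch of the pitfall the stricter types avoid (values are illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t rsize = 4096;	/* unsigned I/O limit, like the new u32 rsize */
		int count = -1;		/* a bogus, negative byte count */

		/* Usual arithmetic conversions turn count into unsigned:
		 * (uint32_t)-1 == 4294967295, so the "is it smaller?"
		 * clamp silently goes the wrong way.
		 */
		if (count < rsize)
			printf("count treated as smaller, clamping to %d\n", count);
		else
			printf("count compared as %u, clamp skipped\n",
			       (uint32_t)count);
		return 0;
	}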
diff --git a/net/Makefile b/net/Makefile
index 428c523296e3..cedc19be7ada 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_INET)	+= ipv4/
 obj-$(CONFIG_TLS)	+= tls/
 obj-$(CONFIG_XFRM)	+= xfrm/
-obj-$(CONFIG_UNIX_SCM)	+= unix/
+obj-$(CONFIG_UNIX)	+= unix/
 obj-y			+= ipv6/
 obj-$(CONFIG_BPFILTER)	+= bpfilter/
 obj-$(CONFIG_PACKET)	+= packet/
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 6257bf12e5a0..ac3cfc1ae510 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -181,6 +181,7 @@ static void
 lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
+	unsigned int len = skb->len;
 
 	ATM_SKB(skb)->vcc = vcc;
 	atm_account_tx(vcc, skb);
@@ -191,7 +192,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
 	}
 
 	dev->stats.tx_packets++;
-	dev->stats.tx_bytes += skb->len;
+	dev->stats.tx_bytes += len;
 }
 
 static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue)
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 033871e718a3..583c27131b7d 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1314,6 +1314,8 @@ static void MPOA_cache_impos_rcvd(struct k_message *msg,
 	holding_time = msg->content.eg_info.holding_time;
 	dprintk("(%s) entry = %p, holding_time = %u\n",
 		mpc->dev->name, entry, holding_time);
+	if (entry == NULL && !holding_time)
+		return;
 	if (entry == NULL && holding_time) {
 		entry = mpc->eg_ops->add_entry(msg, mpc);
 		mpc->eg_ops->put(entry);
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 74b49c35ddc1..209180b4c268 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -324,8 +324,7 @@ batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
 	/* check if there is enough space for the optional TVLV */
 	next_buff_pos += ntohs(ogm_packet->tvlv_len);
 
-	return (next_buff_pos <= packet_len) &&
-	       (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+	return next_buff_pos <= packet_len;
 }
 
 /* send a batman ogm to a given interface */
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index e503ee0d896b..8f89ffe6020c 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -839,8 +839,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
 	/* check if there is enough space for the optional TVLV */
 	next_buff_pos += ntohs(ogm2_packet->tvlv_len);
 
-	return (next_buff_pos <= packet_len) &&
-	       (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+	return next_buff_pos <= packet_len;
 }
 
 /**
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 3bcc15c9415f..13b752c169be 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -825,11 +825,16 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan,
 					  unsigned long hdr_len,
 					  unsigned long len, int nb)
 {
+	struct sk_buff *skb;
+
 	/* Note that we must allocate using GFP_ATOMIC here as
 	 * this function is called originally from netdev hard xmit
 	 * function in atomic context.
 	 */
-	return bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+	skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	return skb;
 }
 
 static void chan_suspend_cb(struct l2cap_chan *chan)
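
The 6lowpan fix above matters because callers of this allocation callback check the result with IS_ERR(), not against NULL. ERR_PTR()/IS_ERR()/PTR_ERR() encode a small negative errno in the pointer value itself. A simplified userspace model of the convention, assuming only the documented -1..-4095 errno range (this is not the kernel header implementation):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define MAX_ERRNO 4095

	/* Simplified versions of the kernel's ERR_PTR helpers: errno
	 * values -1..-4095 map into the top of the address space,
	 * which no valid allocation can occupy.
	 */
	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	static void *alloc_skb_like(size_t len)
	{
		void *p = malloc(len);

		if (!p)
			return ERR_PTR(-ENOMEM);	/* what the fix returns */
		return p;
	}

	int main(void)
	{
		void *skb = alloc_skb_like(64);

		if (IS_ERR(skb)) {
			printf("alloc failed: %ld\n", PTR_ERR(skb));
			return 1;
		}
		free(skb);
		puts("ok");
		return 0;
	}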
diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c
index 1bc51e2b05a3..3e1713673ecc 100644
--- a/net/bluetooth/eir.c
+++ b/net/bluetooth/eir.c
@@ -366,17 +366,19 @@ u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr)
 
 void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len)
 {
-	while ((eir = eir_get_data(eir, eir_len, EIR_SERVICE_DATA, len))) {
+	size_t dlen;
+
+	while ((eir = eir_get_data(eir, eir_len, EIR_SERVICE_DATA, &dlen))) {
 		u16 value = get_unaligned_le16(eir);
 
 		if (uuid == value) {
 			if (len)
-				*len -= 2;
+				*len = dlen - 2;
 			return &eir[2];
 		}
 
-		eir += *len;
-		eir_len -= *len;
+		eir += dlen;
+		eir_len -= dlen;
 	}
 
 	return NULL;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 70be66cd6036..f3596cfdffc2 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1862,10 +1862,8 @@ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
 	if (monitor->handle)
 		idr_remove(&hdev->adv_monitors_idr, monitor->handle);
 
-	if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) {
+	if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED)
 		hdev->adv_monitors_cnt--;
-		mgmt_adv_monitor_removed(hdev, monitor->handle);
-	}
 
 	kfree(monitor);
 }
@@ -2492,6 +2490,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 
 	mutex_init(&hdev->lock);
 	mutex_init(&hdev->req_lock);
+	mutex_init(&hdev->mgmt_pending_lock);
 
 	ida_init(&hdev->unset_handle_ida);
 
@@ -3380,23 +3379,18 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type)
 
 	bt_dev_err(hdev, "link tx timeout");
 
-	rcu_read_lock();
+	hci_dev_lock(hdev);
 
 	/* Kill stalled connections */
-	list_for_each_entry_rcu(c, &h->list, list) {
+	list_for_each_entry(c, &h->list, list) {
 		if (c->type == type && c->sent) {
 			bt_dev_err(hdev, "killing stalled connection %pMR",
 				   &c->dst);
-			/* hci_disconnect might sleep, so, we have to release
-			 * the RCU read lock before calling it.
-			 */
-			rcu_read_unlock();
 			hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM);
-			rcu_read_lock();
 		}
 	}
 
-	rcu_read_unlock();
+	hci_dev_unlock(hdev);
 }
 
 static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 1e689d8c00a5..4029330e29a9 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6149,11 +6149,12 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 	 * event or send an immediate device found event if the data
 	 * should not be stored for later.
 	 */
-	if (!ext_adv && !has_pending_adv_report(hdev)) {
+	if (!has_pending_adv_report(hdev)) {
 		/* If the report will trigger a SCAN_REQ store it for
 		 * later merging.
 		 */
-		if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) {
+		if (!ext_adv && (type == LE_ADV_IND ||
+				 type == LE_ADV_SCAN_IND)) {
 			store_pending_adv_report(hdev, bdaddr, bdaddr_type,
 						 rssi, flags, data, len);
 			return;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index d6f40806ee51..e92bc4ceb5ad 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1586,7 +1586,8 @@ static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
 static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv)
 {
 	u8 bid[3];
-	u8 ad[4 + 3];
+	u8 ad[HCI_MAX_EXT_AD_LENGTH];
+	u8 len;
 
 	/* Skip if NULL adv as instance 0x00 is used for general purpose
 	 * advertising so it cannot used for the likes of Broadcast Announcement
@@ -1612,8 +1613,10 @@ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv)
 	/* Generate Broadcast ID */
 	get_random_bytes(bid, sizeof(bid));
-	eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid));
-	hci_set_adv_instance_data(hdev, adv->instance, sizeof(ad), ad, 0, NULL);
+	len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid));
+	memcpy(ad + len, adv->adv_data, adv->adv_data_len);
+	hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len,
+				  ad, 0, NULL);
 
 	return hci_update_adv_data_sync(hdev, adv->instance);
 }
@@ -1630,8 +1633,15 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
 	if (instance) {
 		adv = hci_find_adv_instance(hdev, instance);
 
-		/* Create an instance if that could not be found */
-		if (!adv) {
+		if (adv) {
+			/* Turn it into periodic advertising */
+			adv->periodic = true;
+			adv->per_adv_data_len = data_len;
+			if (data)
+				memcpy(adv->per_adv_data, data, data_len);
+			adv->flags = flags;
+		} else if (!adv) {
+			/* Create an instance if that could not be found */
 			adv = hci_add_per_instance(hdev, instance, flags,
 						   data_len, data,
 						   sync_interval,
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 8d6fc1869503..2744ad11687c 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1411,7 +1411,8 @@ static void l2cap_request_info(struct l2cap_conn *conn)
 		       sizeof(req), &req);
 }
 
-static bool l2cap_check_enc_key_size(struct hci_conn *hcon)
+static bool l2cap_check_enc_key_size(struct hci_conn *hcon,
+				     struct l2cap_chan *chan)
 {
 	/* The minimum encryption key size needs to be enforced by the
 	 * host stack before establishing any L2CAP connections. The
@@ -1425,7 +1426,7 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon)
 	int min_key_size = hcon->hdev->min_enc_key_size;
 
 	/* On FIPS security level, key size must be 16 bytes */
-	if (hcon->sec_level == BT_SECURITY_FIPS)
+	if (chan->sec_level == BT_SECURITY_FIPS)
 		min_key_size = 16;
 
 	return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) ||
@@ -1453,7 +1454,7 @@ static void l2cap_do_start(struct l2cap_chan *chan)
 	    !__l2cap_no_conn_pending(chan))
 		return;
 
-	if (l2cap_check_enc_key_size(conn->hcon))
+	if (l2cap_check_enc_key_size(conn->hcon, chan))
 		l2cap_start_connection(chan);
 	else
 		__set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
@@ -1528,7 +1529,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
 				continue;
 			}
 
-			if (l2cap_check_enc_key_size(conn->hcon))
+			if (l2cap_check_enc_key_size(conn->hcon, chan))
 				l2cap_start_connection(chan);
 			else
 				l2cap_chan_close(chan, ECONNREFUSED);
@@ -3954,7 +3955,8 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
 	/* Check if the ACL is secure enough (if not SDP) */
 	if (psm != cpu_to_le16(L2CAP_PSM_SDP) &&
-	    !hci_conn_check_link_mode(conn->hcon)) {
+	    (!hci_conn_check_link_mode(conn->hcon) ||
+	     !l2cap_check_enc_key_size(conn->hcon, pchan))) {
 		conn->disc_reason = HCI_ERROR_AUTH_FAILURE;
 		result = L2CAP_CR_SEC_BLOCK;
 		goto response;
@@ -4831,7 +4833,8 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
 
 	if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
 				     SMP_ALLOW_STK)) {
-		result = L2CAP_CR_LE_AUTHENTICATION;
+		result = pchan->sec_level == BT_SECURITY_MEDIUM ?
+			L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION;
 		chan = NULL;
 		goto response_unlock;
 	}
@@ -7322,7 +7325,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
 		}
 
 		if (chan->state == BT_CONNECT) {
-			if (!status && l2cap_check_enc_key_size(hcon))
+			if (!status && l2cap_check_enc_key_size(hcon, chan))
 				l2cap_start_connection(chan);
 			else
 				__set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
@@ -7332,7 +7335,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
 			struct l2cap_conn_rsp rsp;
 			__u16 res, stat;
 
-			if (!status && l2cap_check_enc_key_size(hcon)) {
+			if (!status && l2cap_check_enc_key_size(hcon, chan)) {
 				if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
 					res = L2CAP_CR_PEND;
 					stat = L2CAP_CS_AUTHOR_PEND;
@@ -7385,6 +7388,9 @@ static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb,
 			return -ENOMEM;
 		/* Init rx_len */
 		conn->rx_len = len;
+
+		skb_set_delivery_time(conn->rx_skb, skb->tstamp,
+				      skb->tstamp_type);
 	}
 
 	/* Copy as much as the rx_skb can hold */
@@ -7509,8 +7515,24 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 		if (skb->len > len) {
 			BT_ERR("Frame is too long (len %u, expected len %d)",
 			       skb->len, len);
+			/* PTS test cases L2CAP/COS/CED/BI-14-C and BI-15-C
+			 * (Multiple Signaling Command in one PDU, Data
+			 * Truncated, BR/EDR) send a C-frame to the IUT with
+			 * PDU Length set to 8 and Channel ID set to the
+			 * correct signaling channel for the logical link.
+			 * The Information payload contains one L2CAP_ECHO_REQ
+			 * packet with Data Length set to 0 with 0 octets of
+			 * echo data and one invalid command packet due to
+			 * data truncated in PDU but present in HCI packet.
+			 *
+			 * Shorter the socket buffer to the PDU length to
+			 * allow to process valid commands from the PDU before
+			 * setting the socket unreliable.
+			 */
+			skb->len = len;
+			l2cap_recv_frame(conn, skb);
 			l2cap_conn_unreliable(conn, ECOMM);
-			goto drop;
+			goto unlock;
 		}
 
 		/* Append fragment into frame (with header) */
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 29e420e9754b..853d217cabc9 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1433,22 +1433,17 @@ static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data)
 
 	send_settings_rsp(cmd->sk, cmd->opcode, match->hdev);
 
-	list_del(&cmd->list);
-
 	if (match->sk == NULL) {
 		match->sk = cmd->sk;
 		sock_hold(match->sk);
 	}
-
-	mgmt_pending_free(cmd);
 }
 
 static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data)
 {
 	u8 *status = data;
 
-	mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status);
-	mgmt_pending_remove(cmd);
+	mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, *status);
 }
 
 static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
@@ -1462,8 +1457,6 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
 
 	if (cmd->cmd_complete) {
 		cmd->cmd_complete(cmd, match->mgmt_status);
-		mgmt_pending_remove(cmd);
-
 		return;
 	}
 
@@ -1472,13 +1465,13 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
 
 static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
 {
-	return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+	return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status,
 				 cmd->param, cmd->param_len);
 }
 
 static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
 {
-	return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+	return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status,
 				 cmd->param, sizeof(struct mgmt_addr_info));
 }
 
@@ -1518,7 +1511,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data,
 	if (err) {
 		u8 mgmt_err = mgmt_status(err);
 
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
 		hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
 		goto done;
 	}
@@ -1693,7 +1686,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data,
 	if (err) {
 		u8 mgmt_err = mgmt_status(err);
 
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
 		goto done;
 	}
 
@@ -1930,8 +1923,8 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
 			new_settings(hdev, NULL);
 		}
 
-		mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, cmd_status_rsp,
-				     &mgmt_err);
+		mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true,
+				     cmd_status_rsp, &mgmt_err);
 		return;
 	}
 
@@ -1941,7 +1934,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
 		changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
 	}
 
-	mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match);
+	mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match);
 
 	if (changed)
 		new_settings(hdev, match.sk);
@@ -2061,12 +2054,12 @@ static void set_le_complete(struct hci_dev *hdev, void *data, int err)
 	bt_dev_dbg(hdev, "err %d", err);
 
 	if (status) {
-		mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, cmd_status_rsp,
-				     &status);
+		mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp,
+				     &status);
 		return;
 	}
 
-	mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, settings_rsp, &match);
+	mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match);
 
 	new_settings(hdev, match.sk);
 
@@ -2125,7 +2118,7 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
 	struct sock *sk = cmd->sk;
 
 	if (status) {
-		mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev,
+		mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true,
 				     cmd_status_rsp, &status);
 		return;
 	}
@@ -2566,7 +2559,7 @@ static void mgmt_class_complete(struct hci_dev *hdev, void *data, int err)
 
 	bt_dev_dbg(hdev, "err %d", err);
 
-	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 			  mgmt_status(err), hdev->dev_class, 3);
 
 	mgmt_pending_free(cmd);
@@ -3354,7 +3347,7 @@ static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status)
 	bacpy(&rp.addr.bdaddr, &conn->dst);
 	rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
 
-	err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE,
+	err = mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_PAIR_DEVICE,
 				status, &rp, sizeof(rp));
 
 	/* So we don't get further callbacks for this connection */
@@ -5165,24 +5158,14 @@ static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev,
 	mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk);
 }
 
-void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle)
+static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev,
+				     __le16 handle)
 {
 	struct mgmt_ev_adv_monitor_removed ev;
-	struct mgmt_pending_cmd *cmd;
-	struct sock *sk_skip = NULL;
-	struct mgmt_cp_remove_adv_monitor *cp;
 
-	cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev);
-	if (cmd) {
-		cp = cmd->param;
+	ev.monitor_handle = handle;
 
-		if (cp->monitor_handle)
-			sk_skip = cmd->sk;
-	}
-
-	ev.monitor_handle = cpu_to_le16(handle);
-
-	mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk_skip);
+	mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk);
 }
 
 static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev,
@@ -5253,7 +5236,7 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev,
 		hci_update_passive_scan(hdev);
 	}
 
-	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 			  mgmt_status(status), &rp, sizeof(rp));
 	mgmt_pending_remove(cmd);
 
@@ -5284,8 +5267,7 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
 
 	if (pending_find(MGMT_OP_SET_LE, hdev) ||
 	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
-	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev) ||
-	    pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) {
+	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
 		status = MGMT_STATUS_BUSY;
 		goto unlock;
 	}
@@ -5455,8 +5437,7 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
 	struct mgmt_pending_cmd *cmd = data;
 	struct mgmt_cp_remove_adv_monitor *cp;
 
-	if (status == -ECANCELED ||
-	    cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev))
+	if (status == -ECANCELED)
 		return;
 
 	hci_dev_lock(hdev);
@@ -5465,12 +5446,14 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
 
 	rp.monitor_handle = cp->monitor_handle;
 
-	if (!status)
+	if (!status) {
+		mgmt_adv_monitor_removed(cmd->sk, hdev, cp->monitor_handle);
 		hci_update_passive_scan(hdev);
+	}
 
-	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 			  mgmt_status(status), &rp, sizeof(rp));
-	mgmt_pending_remove(cmd);
+	mgmt_pending_free(cmd);
 
 	hci_dev_unlock(hdev);
 	bt_dev_dbg(hdev, "remove monitor %d complete, status %d",
@@ -5480,10 +5463,6 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
 static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data)
 {
 	struct mgmt_pending_cmd *cmd = data;
-
-	if (cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev))
-		return -ECANCELED;
-
 	struct mgmt_cp_remove_adv_monitor *cp = cmd->param;
 	u16 handle = __le16_to_cpu(cp->monitor_handle);
 
@@ -5502,14 +5481,13 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
 	hci_dev_lock(hdev);
 
 	if (pending_find(MGMT_OP_SET_LE, hdev) ||
-	    pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev) ||
 	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
 	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
 		status = MGMT_STATUS_BUSY;
 		goto unlock;
 	}
 
-	cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len);
+	cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len);
 	if (!cmd) {
 		status = MGMT_STATUS_NO_RESOURCES;
 		goto unlock;
 	}
@@ -5519,7 +5497,7 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
 				  mgmt_remove_adv_monitor_complete);
 
 	if (err) {
-		mgmt_pending_remove(cmd);
+		mgmt_pending_free(cmd);
 
 		if (err == -ENOMEM)
 			status = MGMT_STATUS_NO_RESOURCES;
@@ -5872,7 +5850,7 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err)
 	    cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev))
 		return;
 
-	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err),
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
 			  cmd->param, 1);
 	mgmt_pending_remove(cmd);
 
@@ -6110,7 +6088,7 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err)
 
 	bt_dev_dbg(hdev, "err %d", err);
 
-	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err),
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
 			  cmd->param, 1);
 	mgmt_pending_remove(cmd);
 
@@ -6335,7 +6313,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
 	u8 status = mgmt_status(err);
 
 	if (status) {
-		mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev,
+		mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true,
 				     cmd_status_rsp, &status);
 		return;
 	}
@@ -6345,7 +6323,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
 	else
 		hci_dev_clear_flag(hdev, HCI_ADVERTISING);
 
-	mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp,
+	mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp,
 			     &match);
 
 	new_settings(hdev, match.sk);
@@ -6689,7 +6667,7 @@ static void set_bredr_complete(struct hci_dev *hdev, void *data, int err)
 		 */
 		hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
 
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
 	} else {
 		send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev);
 		new_settings(hdev, cmd->sk);
@@ -6826,7 +6804,7 @@ static void set_secure_conn_complete(struct hci_dev *hdev, void *data, int err)
 	if (err) {
 		u8 mgmt_err = mgmt_status(err);
 
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
 		goto done;
 	}
 
@@ -7273,7 +7251,7 @@ static void get_conn_info_complete(struct hci_dev *hdev, void *data, int err)
 		rp.max_tx_power = HCI_TX_POWER_INVALID;
 	}
 
-	mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, status,
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_GET_CONN_INFO, status,
 			  &rp, sizeof(rp));
 
 	mgmt_pending_free(cmd);
@@ -7433,7 +7411,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, void *data, int err)
 	}
 
 complete:
-	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp,
+	mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, &rp,
 			  sizeof(rp));
 
 	mgmt_pending_free(cmd);
@@ -7605,11 +7583,16 @@ static void add_device_complete(struct hci_dev *hdev, void *data, int err)
 	struct mgmt_cp_add_device *cp = cmd->param;
 
 	if (!err) {
+		struct hci_conn_params *params;
+
+		params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+						le_addr_type(cp->addr.type));
+
 		device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type,
 			     cp->action);
 		device_flags_changed(NULL, hdev, &cp->addr.bdaddr,
 				     cp->addr.type, hdev->conn_flags,
-				     PTR_UINT(cmd->user_data));
+				     params ? params->flags : 0);
 	}
 
 	mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE,
@@ -7712,8 +7695,6 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
 		goto unlock;
 	}
 
-	cmd->user_data = UINT_PTR(current_flags);
-
 	err = hci_cmd_sync_queue(hdev, add_device_sync, cmd,
 				 add_device_complete);
 	if (err < 0) {
@@ -8634,10 +8615,10 @@ static void add_advertising_complete(struct hci_dev *hdev, void *data, int err)
 	rp.instance = cp->instance;
 
 	if (err)
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
 				mgmt_status(err));
 	else
-		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 				  mgmt_status(err), &rp, sizeof(rp));
 
 	add_adv_complete(hdev, cmd->sk, cp->instance, err);
@@ -8825,10 +8806,10 @@ static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data,
 
 		hci_remove_adv_instance(hdev, cp->instance);
 
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
 				mgmt_status(err));
 	} else {
-		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 				  mgmt_status(err), &rp, sizeof(rp));
 	}
 
@@ -8976,10 +8957,10 @@ static void add_ext_adv_data_complete(struct hci_dev *hdev, void *data, int err)
 	rp.instance = cp->instance;
 
 	if (err)
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
 				mgmt_status(err));
 	else
-		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 				  mgmt_status(err), &rp, sizeof(rp));
 
 	mgmt_pending_free(cmd);
@@ -9138,10 +9119,10 @@ static void remove_advertising_complete(struct hci_dev *hdev, void *data,
 	rp.instance = cp->instance;
 
 	if (err)
-		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
 				mgmt_status(err));
 	else
-		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+		mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
 				  MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
 
 	mgmt_pending_free(cmd);
@@ -9412,7 +9393,7 @@ void mgmt_index_removed(struct hci_dev *hdev)
 	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
 		return;
 
-	mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match);
+	mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
 
 	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
 		mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0,
@@ -9450,7 +9431,8 @@ void mgmt_power_on(struct hci_dev *hdev, int err)
 		hci_update_passive_scan(hdev);
 	}
 
-	mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
+	mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp,
+			     &match);
 
 	new_settings(hdev, match.sk);
 
@@ -9465,7 +9447,8 @@ void __mgmt_power_off(struct hci_dev *hdev)
 	struct cmd_lookup match = { NULL, hdev };
 	u8 zero_cod[] = { 0, 0, 0 };
 
-	mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
+	mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp,
+			     &match);
 
 	/* If the power off is because of hdev unregistration let
 	 * use the appropriate INVALID_INDEX status. Otherwise use
@@ -9479,7 +9462,7 @@ void __mgmt_power_off(struct hci_dev *hdev)
 	else
 		match.mgmt_status = MGMT_STATUS_NOT_POWERED;
 
-	mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match);
+	mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
 
 	if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
 		mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
@@ -9720,7 +9703,6 @@ static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data)
 	device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
 
 	cmd->cmd_complete(cmd, 0);
-	mgmt_pending_remove(cmd);
 }
 
 bool mgmt_powering_down(struct hci_dev *hdev)
@@ -9773,8 +9755,8 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
 	struct mgmt_cp_disconnect *cp;
 	struct mgmt_pending_cmd *cmd;
 
-	mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp,
-			     hdev);
+	mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, true,
+			     unpair_device_rsp, hdev);
 
 	cmd = pending_find(MGMT_OP_DISCONNECT, hdev);
 	if (!cmd)
@@ -9967,7 +9949,7 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
 	if (status) {
 		u8 mgmt_err = mgmt_status(status);
 
-		mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev,
+		mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true,
 				     cmd_status_rsp, &mgmt_err);
 		return;
 	}
@@ -9977,8 +9959,8 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
 	else
 		changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY);
 
-	mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp,
-			     &match);
+	mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true,
			     settings_rsp, &match);
 
 	if (changed)
 		new_settings(hdev, match.sk);
@@ -10002,9 +9984,12 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
 {
 	struct cmd_lookup match = { NULL, hdev, mgmt_status(status) };
 
-	mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match);
-	mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
-	mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
+	mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, false, sk_lookup,
+			     &match);
+	mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, false, sk_lookup,
+			     &match);
+	mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, false, sk_lookup,
+			     &match);
 
 	if (!status) {
 		mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index 0115f783bde8..4ba500c377a4 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -217,47 +217,47 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
 struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
 					   struct hci_dev *hdev)
 {
-	struct mgmt_pending_cmd *cmd;
+	struct mgmt_pending_cmd *cmd, *tmp;
+
+	mutex_lock(&hdev->mgmt_pending_lock);
 
-	list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+	list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
 		if (hci_sock_get_channel(cmd->sk) != channel)
 			continue;
-		if (cmd->opcode == opcode)
-			return cmd;
-	}
-
-	return NULL;
-}
-
-struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel,
-						u16 opcode,
-						struct hci_dev *hdev,
-						const void *data)
-{
-	struct mgmt_pending_cmd *cmd;
-
-	list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
-		if (cmd->user_data != data)
-			continue;
-		if (cmd->opcode == opcode)
+		if (cmd->opcode == opcode) {
+			mutex_unlock(&hdev->mgmt_pending_lock);
 			return cmd;
+		}
 	}
 
+	mutex_unlock(&hdev->mgmt_pending_lock);
+
 	return NULL;
 }
 
-void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev,
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove,
 			  void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
 			  void *data)
 {
 	struct mgmt_pending_cmd *cmd, *tmp;
 
+	mutex_lock(&hdev->mgmt_pending_lock);
+
 	list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
 		if (opcode > 0 && cmd->opcode != opcode)
 			continue;
 
+		if (remove)
+			list_del(&cmd->list);
+
 		cb(cmd, data);
+
+		if (remove)
+			mgmt_pending_free(cmd);
 	}
+
+	mutex_unlock(&hdev->mgmt_pending_lock);
 }
 
 struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
@@ -271,7 +271,7 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
 		return NULL;
 
 	cmd->opcode = opcode;
-	cmd->index = hdev->id;
+	cmd->hdev = hdev;
 
 	cmd->param = kmemdup(data, len, GFP_KERNEL);
 	if (!cmd->param) {
@@ -297,7 +297,9 @@ struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
 	if (!cmd)
 		return NULL;
 
+	mutex_lock(&hdev->mgmt_pending_lock);
 	list_add_tail(&cmd->list, &hdev->mgmt_pending);
+	mutex_unlock(&hdev->mgmt_pending_lock);
 
 	return cmd;
 }
@@ -311,7 +313,10 @@ void mgmt_pending_free(struct mgmt_pending_cmd *cmd)
 
 void mgmt_pending_remove(struct mgmt_pending_cmd *cmd)
 {
+	mutex_lock(&cmd->hdev->mgmt_pending_lock);
 	list_del(&cmd->list);
+	mutex_unlock(&cmd->hdev->mgmt_pending_lock);
+
 	mgmt_pending_free(cmd);
 }
 
@@ -321,7 +326,7 @@ void mgmt_mesh_foreach(struct hci_dev *hdev,
 {
 	struct mgmt_mesh_tx *mesh_tx, *tmp;
 
-	list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) {
+	list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) {
 		if (!sk || mesh_tx->sk == sk)
 			cb(mesh_tx, data);
 	}
diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h
index bdf978605d5a..024e51dd6937 100644
--- a/net/bluetooth/mgmt_util.h
+++ b/net/bluetooth/mgmt_util.h
@@ -33,7 +33,7 @@ struct mgmt_mesh_tx {
 struct mgmt_pending_cmd {
 	struct list_head list;
 	u16 opcode;
-	int index;
+	struct hci_dev *hdev;
 	void *param;
 	size_t param_len;
 	struct sock *sk;
@@ -54,11 +54,7 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
 
 struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
 					   struct hci_dev *hdev);
-struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel,
-						u16 opcode,
-						struct hci_dev *hdev,
-						const void *data);
-void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev,
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove,
 			  void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
 			  void *data);
 struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
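
The mgmt_util change above serializes all access to hdev->mgmt_pending behind a new mutex and lets mgmt_pending_foreach() unlink entries before invoking the callback, so callbacks no longer race with concurrent removal. A compact userspace model of that "iterate, optionally detach, then free" pattern, with a pthread mutex standing in for the kernel mutex (names are illustrative):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct pending_cmd {
		int opcode;
		struct pending_cmd *next;
	};

	static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct pending_cmd *pending_head;

	/* Walk the list under the lock; when remove is set, unlink each
	 * matching entry before the callback runs and free it afterwards,
	 * mirroring the new mgmt_pending_foreach(..., remove, ...) contract.
	 */
	static void pending_foreach(int opcode, int remove,
				    void (*cb)(struct pending_cmd *, void *),
				    void *data)
	{
		struct pending_cmd **pp, *cmd;

		pthread_mutex_lock(&pending_lock);
		pp = &pending_head;
		while ((cmd = *pp)) {
			if (opcode && cmd->opcode != opcode) {
				pp = &cmd->next;
				continue;
			}
			if (remove)
				*pp = cmd->next;	/* detach before callback */
			cb(cmd, data);
			if (remove)
				free(cmd);
			else
				pp = &cmd->next;
		}
		pthread_mutex_unlock(&pending_lock);
	}

	static void print_cmd(struct pending_cmd *cmd, void *data)
	{
		printf("handling opcode %d\n", cmd->opcode);
	}

	int main(void)
	{
		for (int op = 1; op <= 3; op++) {
			struct pending_cmd *cmd = malloc(sizeof(*cmd));

			cmd->opcode = op;
			cmd->next = pending_head;
			pending_head = cmd;
		}
		pending_foreach(0, 1, print_cmd, NULL);	/* 0 matches all */
		return pending_head != NULL;
	}

As in the patch, the callback runs with the lock held, so callbacks must not try to take the same lock again.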
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 7305f5f8215c..96bea0c8408f 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -1030,7 +1030,7 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg,
 
 	/* host join */
 	if (!port) {
-		if (mp->host_joined) {
+		if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) {
 			NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host");
 			return -EEXIST;
 		}
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 98aea5485aae..a8c67035e23c 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -65,17 +65,14 @@ static struct dst_ops fake_dst_ops = {
  * ipt_REJECT needs it. Future netfilter modules might
  * require us to fill additional fields.
  */
-static const u32 br_dst_default_metrics[RTAX_MAX] = {
-	[RTAX_MTU - 1] = 1500,
-};
-
 void br_netfilter_rtable_init(struct net_bridge *br)
 {
 	struct rtable *rt = &br->fake_rtable;
 
 	rcuref_init(&rt->dst.__rcuref, 1);
 	rt->dst.dev = br->dev;
-	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
+	dst_init_metrics(&rt->dst, br->metrics, false);
+	dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);
 	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
 	rt->dst.ops = &fake_dst_ops;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 72d80fd943a8..9197b511e459 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -502,6 +502,7 @@ struct net_bridge {
 		struct rtable		fake_rtable;
 		struct rt6_info		fake_rt6_info;
 	};
+	u32				metrics[RTAX_MAX];
 #endif
 	u16				group_fwd_mask;
 	u16				group_fwd_mask_required;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index be714b4d7b43..a1c22eab71ff 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -715,8 +715,8 @@ static int br_vlan_add_existing(struct net_bridge *br,
 				u16 flags, bool *changed,
 				struct netlink_ext_ack *extack)
 {
-	bool would_change = __vlan_flags_would_change(vlan, flags);
 	bool becomes_brentry = false;
+	bool would_change = false;
 	int err;
 
 	if (!br_vlan_is_brentry(vlan)) {
@@ -725,6 +725,8 @@ static int br_vlan_add_existing(struct net_bridge *br,
 			return -EINVAL;
 
 		becomes_brentry = true;
+	} else {
+		would_change = __vlan_flags_would_change(vlan, flags);
 	}
 
 	/* Master VLANs that aren't brentries weren't notified before,
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 6ef04f9fe481..4b4a396d9722 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -32,7 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 					   struct sk_buff *))
 {
 	int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
-	bool mono_delivery_time = skb->mono_delivery_time;
+	u8 tstamp_type = skb->tstamp_type;
 	unsigned int hlen, ll_rs, mtu;
 	ktime_t tstamp = skb->tstamp;
 	struct ip_frag_state state;
@@ -60,19 +60,19 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 		struct ip_fraglist_iter iter;
 		struct sk_buff *frag;
 
-		if (first_len - hlen > mtu ||
-		    skb_headroom(skb) < ll_rs)
+		if (first_len - hlen > mtu)
 			goto blackhole;
 
-		if (skb_cloned(skb))
+		if (skb_cloned(skb) ||
+		    skb_headroom(skb) < ll_rs)
 			goto slow_path;
 
 		skb_walk_frags(skb, frag) {
-			if (frag->len > mtu ||
-			    skb_headroom(frag) < hlen + ll_rs)
+			if (frag->len > mtu)
 				goto blackhole;
 
-			if (skb_shared(frag))
+			if (skb_shared(frag) ||
+			    skb_headroom(frag) < hlen + ll_rs)
 				goto slow_path;
 		}
 
@@ -82,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
 			if (iter.frag)
 				ip_fraglist_prepare(skb, &iter);
 
-			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
+			skb_set_delivery_time(skb, tstamp, tstamp_type);
 			err = output(net, sk, data, skb);
 			if (err || !iter.frag)
 				break;
@@ -113,7 +113,7 @@ slow_path:
 			goto blackhole;
 		}
 
-		skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
+		skb_set_delivery_time(skb2, tstamp, tstamp_type);
 		err = output(net, sk, data, skb2);
 		if (err)
 			goto blackhole;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index c469fc187f0c..cdad0be43e8f 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -287,8 +287,8 @@ int can_send(struct sk_buff *skb, int loop)
 		netif_rx(newskb);
 
 	/* update statistics */
-	pkg_stats->tx_frames++;
-	pkg_stats->tx_frames_delta++;
+	atomic_long_inc(&pkg_stats->tx_frames);
+	atomic_long_inc(&pkg_stats->tx_frames_delta);
 
 	return 0;
 
@@ -647,8 +647,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
 	int matches;
 
 	/* update statistics */
-	pkg_stats->rx_frames++;
-	pkg_stats->rx_frames_delta++;
+	atomic_long_inc(&pkg_stats->rx_frames);
+	atomic_long_inc(&pkg_stats->rx_frames_delta);
 
 	/* create non-zero unique skb identifier together with *skb */
 	while (!(can_skb_prv(skb)->skbcnt))
@@ -669,8 +669,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
 	consume_skb(skb);
 
 	if (matches > 0) {
-		pkg_stats->matches++;
-		pkg_stats->matches_delta++;
+		atomic_long_inc(&pkg_stats->matches);
+		atomic_long_inc(&pkg_stats->matches_delta);
 	}
 }
diff --git a/net/can/af_can.h b/net/can/af_can.h
index 7c2d9161e224..22f3352c77fe 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -66,9 +66,9 @@ struct receiver {
 struct can_pkg_stats {
 	unsigned long jiffies_init;
 
-	unsigned long rx_frames;
-	unsigned long tx_frames;
-	unsigned long matches;
+	atomic_long_t rx_frames;
+	atomic_long_t tx_frames;
+	atomic_long_t matches;
 
 	unsigned long total_rx_rate;
 	unsigned long total_tx_rate;
@@ -82,9 +82,9 @@ struct can_pkg_stats {
 	unsigned long max_tx_rate;
 	unsigned long max_rx_match_ratio;
 
-	unsigned long rx_frames_delta;
-	unsigned long tx_frames_delta;
-	unsigned long matches_delta;
+	atomic_long_t rx_frames_delta;
+	atomic_long_t tx_frames_delta;
+	atomic_long_t matches_delta;
 };
 
 /* persistent statistics */
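
The CAN statistics above move from plain unsigned long counters, which are racy when incremented from several CPUs, to atomic_long_t. The same idea in portable C11: atomic increments on the hot path, a plain snapshot read on the reporting path (this is a standalone illustration, not kernel API):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	/* Shared frame counter, like pkg_stats->rx_frames after the change. */
	static atomic_long rx_frames;

	static void *rx_path(void *arg)
	{
		for (int i = 0; i < 100000; i++)
			atomic_fetch_add_explicit(&rx_frames, 1,
						  memory_order_relaxed);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[4];

		for (int i = 0; i < 4; i++)
			pthread_create(&t[i], NULL, rx_path, NULL);
		for (int i = 0; i < 4; i++)
			pthread_join(t[i], NULL);

		/* The reporting side takes one snapshot, as can_stat_update()
		 * now does with atomic_long_read(), instead of re-reading a
		 * value other CPUs may be changing underneath it.
		 */
		long snapshot = atomic_load(&rx_frames);
		printf("rx_frames = %ld (expected 400000)\n", snapshot);
		return snapshot != 400000;
	}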
diff --git a/net/can/bcm.c b/net/can/bcm.c
index a1f5db0fd5d4..75653584f31b 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -58,6 +58,7 @@
 #include <linux/can/skb.h>
 #include <linux/can/bcm.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <net/sock.h>
 #include <net/net_namespace.h>
 
@@ -120,6 +121,7 @@ struct bcm_op {
 	struct canfd_frame last_sframe;
 	struct sock *sk;
 	struct net_device *rx_reg_dev;
+	spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */
};
 
 struct bcm_sock {
@@ -205,7 +207,9 @@ static int bcm_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex));
 	seq_printf(m, " <<<\n");
 
-	list_for_each_entry(op, &bo->rx_ops, list) {
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(op, &bo->rx_ops, list) {
 
 		unsigned long reduction;
 
@@ -261,6 +265,9 @@ static int bcm_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "# sent %ld\n", op->frames_abs);
 	}
 	seq_putc(m, '\n');
+
+	rcu_read_unlock();
+
 	return 0;
 }
 #endif /* CONFIG_PROC_FS */
@@ -273,13 +280,18 @@ static void bcm_can_tx(struct bcm_op *op)
 {
 	struct sk_buff *skb;
 	struct net_device *dev;
-	struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
+	struct canfd_frame *cf;
 	int err;
 
 	/* no target device? => exit */
 	if (!op->ifindex)
 		return;
 
+	/* read currframe under lock protection */
+	spin_lock_bh(&op->bcm_tx_lock);
+	cf = op->frames + op->cfsiz * op->currframe;
+	spin_unlock_bh(&op->bcm_tx_lock);
+
 	dev = dev_get_by_index(sock_net(op->sk), op->ifindex);
 	if (!dev) {
 		/* RFC: should this bcm_op remove itself here? */
@@ -300,6 +312,10 @@ static void bcm_can_tx(struct bcm_op *op)
 	skb->dev = dev;
 	can_skb_set_owner(skb, op->sk);
 	err = can_send(skb, 1);
+
+	/* update currframe and count under lock protection */
+	spin_lock_bh(&op->bcm_tx_lock);
+
 	if (!err)
 		op->frames_abs++;
 
@@ -308,6 +324,11 @@ static void bcm_can_tx(struct bcm_op *op)
 	/* reached last frame? */
 	if (op->currframe >= op->nframes)
 		op->currframe = 0;
+
+	if (op->count > 0)
+		op->count--;
+
+	spin_unlock_bh(&op->bcm_tx_lock);
 out:
 	dev_put(dev);
 }
@@ -404,7 +425,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
 	struct bcm_msg_head msg_head;
 
 	if (op->kt_ival1 && (op->count > 0)) {
-		op->count--;
+		bcm_can_tx(op);
 
 		if (!op->count && (op->flags & TX_COUNTEVT)) {
 			/* create notification to user */
@@ -419,7 +440,6 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
 
 			bcm_send_to_user(op, &msg_head, NULL, 0);
 		}
-		bcm_can_tx(op);
 
 	} else if (op->kt_ival2) {
 		bcm_can_tx(op);
@@ -801,7 +821,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
 						  REGMASK(op->can_id),
 						  bcm_rx_handler, op);
 
-			list_del(&op->list);
+			list_del_rcu(&op->list);
 			bcm_remove_op(op);
 			return 1; /* done */
 		}
@@ -821,7 +841,7 @@ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
 	list_for_each_entry_safe(op, n, ops, list) {
 		if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
 		    (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
-			list_del(&op->list);
+			list_del_rcu(&op->list);
 			bcm_remove_op(op);
 			return 1; /* done */
 		}
@@ -914,6 +934,27 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		}
 		op->flags = msg_head->flags;
 
+		/* only lock for unlikely count/nframes/currframe changes */
+		if (op->nframes != msg_head->nframes ||
+		    op->flags & TX_RESET_MULTI_IDX ||
+		    op->flags & SETTIMER) {
+
+			spin_lock_bh(&op->bcm_tx_lock);
+
+			if (op->nframes != msg_head->nframes ||
+			    op->flags & TX_RESET_MULTI_IDX) {
+				/* potentially update changed nframes */
+				op->nframes = msg_head->nframes;
+				/* restart multiple frame transmission */
+				op->currframe = 0;
+			}
+
+			if (op->flags & SETTIMER)
+				op->count = msg_head->count;
+
+			spin_unlock_bh(&op->bcm_tx_lock);
+		}
+
 	} else {
 		/* insert new BCM operation for the given can_id */
 
@@ -921,9 +962,14 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		if (!op)
 			return -ENOMEM;
 
+		spin_lock_init(&op->bcm_tx_lock);
 		op->can_id = msg_head->can_id;
 		op->cfsiz = CFSIZ(msg_head->flags);
 		op->flags = msg_head->flags;
+		op->nframes = msg_head->nframes;
+
+		if (op->flags & SETTIMER)
+			op->count = msg_head->count;
 
 		/* create array for CAN frames and copy the data */
 		if (msg_head->nframes > 1) {
@@ -982,22 +1028,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 
 	} /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */
 
-	if (op->nframes != msg_head->nframes) {
-		op->nframes   = msg_head->nframes;
-		/* start multiple frame transmission with index 0 */
-		op->currframe = 0;
-	}
-
-	/* check flags */
-
-	if (op->flags & TX_RESET_MULTI_IDX) {
-		/* start multiple frame transmission with index 0 */
-		op->currframe = 0;
-	}
-
 	if (op->flags & SETTIMER) {
 		/* set timer values */
-		op->count = msg_head->count;
 		op->ival1 = msg_head->ival1;
 		op->ival2 = msg_head->ival2;
 		op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
@@ -1014,11 +1046,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		op->flags |= TX_ANNOUNCE;
 	}
 
-	if (op->flags & TX_ANNOUNCE) {
+	if (op->flags & TX_ANNOUNCE)
 		bcm_can_tx(op);
-		if (op->count)
-			op->count--;
-	}
 
 	if (op->flags & STARTTIMER)
 		bcm_tx_start_timer(op);
@@ -1234,7 +1263,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 					     bcm_rx_handler, op, "bcm", sk);
 		if (err) {
 			/* this bcm rx op is broken -> remove it */
-			list_del(&op->list);
+			list_del_rcu(&op->list);
 			bcm_remove_op(op);
 			return err;
 		}
diff --git a/net/can/gw.c b/net/can/gw.c
index 37528826935e..e65500c52bf5 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -130,7 +130,7 @@ struct cgw_job {
 	u32 handled_frames;
 	u32 dropped_frames;
 	u32 deleted_frames;
-	struct cf_mod mod;
+	struct cf_mod __rcu *cf_mod;
 	union {
 		/* CAN frame data source */
 		struct net_device *dev;
@@ -459,6 +459,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
 	struct cgw_job *gwj = (struct cgw_job *)data;
 	struct canfd_frame *cf;
 	struct sk_buff *nskb;
+	struct cf_mod *mod;
 	int modidx = 0;
 
 	/* process strictly Classic CAN or CAN FD frames */
@@ -506,7 +507,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
 	 * When there is at least one modification function activated,
 	 * we need to copy the skb as we want to modify skb->data.
 	 */
-	if (gwj->mod.modfunc[0])
+	mod = rcu_dereference(gwj->cf_mod);
+	if (mod->modfunc[0])
 		nskb = skb_copy(skb, GFP_ATOMIC);
 	else
 		nskb = skb_clone(skb, GFP_ATOMIC);
@@ -529,8 +531,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
 	cf = (struct canfd_frame *)nskb->data;
 
 	/* perform preprocessed modification functions if there are any */
-	while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx])
-		(*gwj->mod.modfunc[modidx++])(cf, &gwj->mod);
+	while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx])
+		(*mod->modfunc[modidx++])(cf, mod);
 
 	/* Has the CAN frame been modified? */
 	if (modidx) {
@@ -546,11 +548,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
 		}
 
 		/* check for checksum updates */
-		if (gwj->mod.csumfunc.crc8)
-			(*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8);
+		if (mod->csumfunc.crc8)
+			(*mod->csumfunc.crc8)(cf, &mod->csum.crc8);
 
-		if (gwj->mod.csumfunc.xor)
-			(*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor);
+		if (mod->csumfunc.xor)
+			(*mod->csumfunc.xor)(cf, &mod->csum.xor);
 	}
 
 	/* clear the skb timestamp if not configured the other way */
@@ -581,9 +583,20 @@ static void cgw_job_free_rcu(struct rcu_head *rcu_head)
 {
 	struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu);
 
+	/* cgw_job::cf_mod is always accessed from the same cgw_job object within
+	 * the same RCU read section. Once cgw_job is scheduled for removal,
+	 * cf_mod can also be removed without mandating an additional grace period.
+	 */
+	kfree(rcu_access_pointer(gwj->cf_mod));
 	kmem_cache_free(cgw_cache, gwj);
 }
 
+/* Return cgw_job::cf_mod with RTNL protected section */
+static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj)
+{
+	return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked());
+}
+
 static int cgw_notifier(struct notifier_block *nb,
 			unsigned long msg, void *ptr)
 {
@@ -616,6 +629,7 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
 {
 	struct rtcanmsg *rtcan;
 	struct nlmsghdr *nlh;
+	struct cf_mod *mod;
 
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags);
 	if (!nlh)
@@ -650,82 +664,83 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
 			goto cancel;
 	}
 
+	mod = cgw_job_cf_mod(gwj);
 	if (gwj->flags & CGW_FLAGS_CAN_FD) {
 		struct cgw_fdframe_mod mb;
 
-		if (gwj->mod.modtype.and) {
-			memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.and;
+		if (mod->modtype.and) {
+			memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf));
+			mb.modtype = mod->modtype.and;
 			if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 
-		if (gwj->mod.modtype.or) {
-			memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.or;
+		if (mod->modtype.or) {
+			memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf));
+			mb.modtype = mod->modtype.or;
 			if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 
-		if (gwj->mod.modtype.xor) {
-			memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.xor;
+		if (mod->modtype.xor) {
+			memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf));
+			mb.modtype = mod->modtype.xor;
 			if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 
-		if (gwj->mod.modtype.set) {
-			memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.set;
+		if (mod->modtype.set) {
+			memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf));
+			mb.modtype = mod->modtype.set;
 			if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 	} else {
 		struct cgw_frame_mod mb;
 
-		if (gwj->mod.modtype.and) {
-			memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.and;
+		if (mod->modtype.and) {
+			memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf));
+			mb.modtype = mod->modtype.and;
 			if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 
-		if (gwj->mod.modtype.or) {
-			memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.or;
+		if (mod->modtype.or) {
+			memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf));
+			mb.modtype = mod->modtype.or;
 			if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 
-		if (gwj->mod.modtype.xor) {
-			memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.xor;
+		if (mod->modtype.xor) {
+			memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf));
+			mb.modtype = mod->modtype.xor;
 			if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 
-		if (gwj->mod.modtype.set) {
-			memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
-			mb.modtype = gwj->mod.modtype.set;
+		if (mod->modtype.set) {
+			memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf));
+			mb.modtype = mod->modtype.set;
 			if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
 				goto cancel;
 		}
 	}
 
-	if (gwj->mod.uid) {
-		if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0)
+	if (mod->uid) {
+		if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0)
 			goto cancel;
 	}
 
-	if (gwj->mod.csumfunc.crc8) {
+	if (mod->csumfunc.crc8) {
 		if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN,
-			    &gwj->mod.csum.crc8) < 0)
+			    &mod->csum.crc8) < 0)
 			goto cancel;
 	}
 
-	if (gwj->mod.csumfunc.xor) {
+	if (mod->csumfunc.xor) {
 		if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN,
-			    &gwj->mod.csum.xor) < 0)
+			    &mod->csum.xor) < 0)
 			goto cancel;
 	}
 
@@ -1059,7 +1074,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct net *net = sock_net(skb->sk);
 	struct rtcanmsg *r;
 	struct cgw_job *gwj;
-	struct cf_mod mod;
+	struct cf_mod *mod;
 	struct can_can_gw ccgw;
 	u8 limhops = 0;
 	int err = 0;
@@ -1078,37 +1093,48 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (r->gwtype != CGW_TYPE_CAN_CAN)
 		return -EINVAL;
 
-	err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
+	mod = kmalloc(sizeof(*mod), GFP_KERNEL);
+	if (!mod)
+		return -ENOMEM;
+
+	err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
 	if (err < 0)
-		return err;
+		goto out_free_cf;
 
-	if (mod.uid) {
+	if (mod->uid) {
 		ASSERT_RTNL();
 
 		/* check for updating an existing job with identical uid */
 		hlist_for_each_entry(gwj, &net->can.cgw_list, list) {
-			if (gwj->mod.uid != mod.uid)
+			struct cf_mod *old_cf;
+
+			old_cf = cgw_job_cf_mod(gwj);
+			if (old_cf->uid != mod->uid)
 				continue;
 
 			/* interfaces & filters must be identical */
-			if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
-				return -EINVAL;
+			if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) {
+				err = -EINVAL;
+				goto out_free_cf;
+			}
 
-			/* update modifications with disabled softirq & quit */
-			local_bh_disable();
-			memcpy(&gwj->mod, &mod, sizeof(mod));
-			local_bh_enable();
+			rcu_assign_pointer(gwj->cf_mod, mod);
+			kfree_rcu_mightsleep(old_cf);
 			return 0;
 		}
 	}
 
 	/* ifindex == 0 is not allowed for job creation */
-	if (!ccgw.src_idx || !ccgw.dst_idx)
-		return -ENODEV;
+	if (!ccgw.src_idx || !ccgw.dst_idx) {
+		err = -ENODEV;
+		goto out_free_cf;
+	}
 
 	gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL);
-	if (!gwj)
-		return -ENOMEM;
+	if (!gwj) {
+		err = -ENOMEM;
+		goto out_free_cf;
+	}
 
 	gwj->handled_frames = 0;
 	gwj->dropped_frames = 0;
@@ -1118,7 +1144,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
 	gwj->limit_hops = limhops;
 
 	/* insert already parsed information */
-	memcpy(&gwj->mod, &mod, sizeof(mod));
+	RCU_INIT_POINTER(gwj->cf_mod, mod);
 	memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw));
 
 	err = -ENODEV;
@@ -1152,9 +1178,11 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (!err)
 		hlist_add_head_rcu(&gwj->list, &net->can.cgw_list);
 out:
-	if (err)
+	if (err) {
 		kmem_cache_free(cgw_cache, gwj);
-
+out_free_cf:
+		kfree(mod);
+	}
 	return err;
 }
 
@@ -1214,19 +1242,22 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	/* remove only the first matching entry */
 	hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
+		struct cf_mod *cf_mod;
+
 		if (gwj->flags != r->flags)
 			continue;
 
 		if (gwj->limit_hops != limhops)
 			continue;
 
+		cf_mod = cgw_job_cf_mod(gwj);
 		/* we have a match when uid is enabled and identical */
-		if (gwj->mod.uid || mod.uid) {
-			if (gwj->mod.uid != mod.uid)
+		if (cf_mod->uid || mod.uid) {
+			if (cf_mod->uid != mod.uid)
 				continue;
 		} else {
 			/* no uid => check for identical modifications */
-			if (memcmp(&gwj->mod, &mod, sizeof(mod)))
+			if (memcmp(cf_mod, &mod, sizeof(mod)))
 				continue;
 		}
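
The can-gw change above is a textbook RCU publish pattern: readers pick up cf_mod with rcu_dereference() inside their read-side section, while the RTNL-serialized updater swaps in a fresh copy with rcu_assign_pointer() and defers freeing the old one past a grace period. A schematic kernel-style sketch of just the update step; this is not standalone-buildable and the names mirror but simplify the patch:

	/* Sketch of the RCU update pattern used for cgw_job::cf_mod.
	 * Reader side (softirq receive path, under an RCU read section):
	 *
	 *	mod = rcu_dereference(gwj->cf_mod);
	 *	... use *mod consistently for this one frame ...
	 *
	 * Updater side, serialized by RTNL:
	 */
	static int update_job_mod(struct cgw_job *gwj, const struct cf_mod *new_tmpl)
	{
		struct cf_mod *new_mod, *old_mod;

		new_mod = kmemdup(new_tmpl, sizeof(*new_mod), GFP_KERNEL);
		if (!new_mod)
			return -ENOMEM;

		old_mod = rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked());

		/* Publish: readers from now on see the new modification set. */
		rcu_assign_pointer(gwj->cf_mod, new_mod);

		/* Free the old copy only after every ongoing reader is done. */
		kfree_rcu_mightsleep(old_mod);
		return 0;
	}

The win over the old memcpy-under-local_bh_disable() approach is that a reader can never observe a half-updated cf_mod: it sees either the old or the new object, never a mix.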
atomic_long_read(&pkg_stats->rx_frames); + long tx_frames = atomic_long_read(&pkg_stats->tx_frames); + long matches = atomic_long_read(&pkg_stats->matches); + long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta); + long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta); + long matches_delta = atomic_long_read(&pkg_stats->matches_delta); + /* restart counting in timer context on user request */ if (user_reset) can_init_stats(net); @@ -127,35 +134,33 @@ void can_stat_update(struct timer_list *t) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->rx_frames > (ULONG_MAX / HZ)) + if (rx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->tx_frames > (ULONG_MAX / HZ)) + if (tx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* matches overflow - very improbable */ - if (pkg_stats->matches > (ULONG_MAX / 100)) + if (matches > (LONG_MAX / 100)) can_init_stats(net); /* calc total values */ - if (pkg_stats->rx_frames) - pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) / - pkg_stats->rx_frames; + if (rx_frames) + pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames; pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->tx_frames); + tx_frames); pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->rx_frames); + rx_frames); /* calc current values */ - if (pkg_stats->rx_frames_delta) + if (rx_frames_delta) pkg_stats->current_rx_match_ratio = - (pkg_stats->matches_delta * 100) / - pkg_stats->rx_frames_delta; + (matches_delta * 100) / rx_frames_delta; - pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta); - pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta); + pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta); + pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta); /* check / update maximum values */ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) @@ -168,9 +173,9 @@ void can_stat_update(struct timer_list *t) pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - pkg_stats->tx_frames_delta = 0; - pkg_stats->rx_frames_delta = 0; - pkg_stats->matches_delta = 0; + atomic_long_set(&pkg_stats->tx_frames_delta, 0); + atomic_long_set(&pkg_stats->rx_frames_delta, 0); + atomic_long_set(&pkg_stats->matches_delta, 0); /* restart timer (one second) */ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); @@ -214,9 +219,12 @@ static int can_stats_proc_show(struct seq_file *m, void *v) struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", + atomic_long_read(&pkg_stats->tx_frames)); + seq_printf(m, " %8ld received frames (RXF)\n", + atomic_long_read(&pkg_stats->rx_frames)); + seq_printf(m, " %8ld matched frames (RXMF)\n", + atomic_long_read(&pkg_stats->matches)); seq_putc(m, '\n'); diff --git a/net/core/dev.c b/net/core/dev.c index c31a7f7bedf3..4006fd164b7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2189,7 +2189,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = 
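can_stat_update() above snapshots every counter exactly once with atomic_long_read() before computing ratios, and the overflow guards now compare against LONG_MAX / HZ because the snapshot is a signed long. A compilable sketch of the snapshot-then-compute discipline; HZ and calc_rate() are simplified stand-ins, not the kernel's versions:

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

#define HZ 100 /* illustrative tick rate */

static atomic_long rx_frames;

static long calc_rate(long frames) { return frames * HZ; }

int main(void)
{
    atomic_fetch_add(&rx_frames, 12345);

    long rx = atomic_load(&rx_frames); /* one snapshot, reused below */

    if (rx > LONG_MAX / HZ) /* would overflow inside calc_rate() */
        rx = 0;             /* the kernel restarts its stats instead */
    printf("rate=%ld\n", calc_rate(rx));
    return 0;
}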
SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } diff --git a/net/core/dst.c b/net/core/dst.c index 137b8d1c7220..aad197e761cb 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -167,6 +167,14 @@ static void dst_count_dec(struct dst_entry *dst) void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; + + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); + } +#endif dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } diff --git a/net/core/filter.c b/net/core/filter.c index 84992279f4b1..5143c8a9e52c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -211,24 +211,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u8 tmp, *ptr; + u8 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return *(u8 *)(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return tmp; - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return *(u8 *)ptr; - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, @@ -241,21 +253,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be16 tmp, *ptr; + __be16 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return get_unaligned_be16(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be16_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be16(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, @@ -268,21 +278,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be32 tmp, *ptr; + __be32 tmp; const int len = sizeof(tmp); - if (likely(offset >= 0)) { - if (headlen - offset >= len) - return get_unaligned_be32(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be32_to_cpu(tmp); - } else { - ptr = 
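bpf_skb_load_helper_convert_offset(), introduced above, folds the legacy negative-offset conventions (SKF_NET_OFF and SKF_LL_OFF relative loads) into one translation step and returns INT_MIN as an out-of-band "invalid" sentinel, which the 8/16/32-bit load helpers all map to -EFAULT. A standalone model; the SKF_* constants mirror linux/filter.h, while net_off/mac_off/mac_set stand in for the skb header state:

#include <limits.h>
#include <stdio.h>

#define SKF_NET_OFF (-0x100000)
#define SKF_LL_OFF  (-0x200000)

static int convert_offset(int offset, int net_off, int mac_off,
                          int mac_set)
{
    if (offset >= 0)
        return offset;
    if (offset >= SKF_NET_OFF)
        return offset - SKF_NET_OFF + net_off;
    if (offset >= SKF_LL_OFF && mac_set)
        return offset - SKF_LL_OFF + mac_off;
    return INT_MIN; /* caller maps this to -EFAULT */
}

int main(void)
{
    /* Load 2 bytes past the network header (14-byte Ethernet). */
    printf("%d\n", convert_offset(SKF_NET_OFF + 2, 14, 0, 1));
    return 0;
}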
bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be32(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, @@ -2499,6 +2507,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? @@ -7726,13 +7735,13 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, if (!tstamp) return -EINVAL; skb->tstamp = tstamp; - skb->mono_delivery_time = 1; + skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_TSTAMP_UNSPEC: if (tstamp) return -EINVAL; skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; break; default: return -EINVAL; @@ -7860,42 +7869,37 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { #endif /* CONFIG_INET */ -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == sk_skb_change_head || - func == bpf_skb_change_tail || - func == sk_skb_change_tail || - func == bpf_skb_adjust_room || - func == sk_skb_adjust_room || - func == bpf_skb_pull_data || - func == sk_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_msg_push_data || - func == bpf_msg_pop_data || - func == bpf_xdp_adjust_tail || -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) - func == bpf_lwt_seg6_store_bytes || - func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action || -#endif -#ifdef CONFIG_INET - func == bpf_sock_ops_store_hdr_opt || -#endif - func == bpf_lwt_in_push_encap || - func == bpf_lwt_xmit_push_encap) +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: + /* tail-called program could call any of the above */ + case BPF_FUNC_tail_call: return true; - - return false; + default: + return false; + } } const struct bpf_func_proto bpf_event_output_data_proto __weak; @@ -9435,7 +9439,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); - /* skb->tc_at_ingress && skb->mono_delivery_time, + /* 
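bpf_helper_changes_pkt_data() above is rewritten to switch on the stable helper enum instead of comparing function pointers, which also makes it natural to return true for BPF_FUNC_tail_call: a tail-called program might invoke any packet-rewriting helper, so the caller must assume the worst. A minimal sketch of the enum-driven predicate, with an invented three-entry enum standing in for the real helper IDs:

#include <stdbool.h>

enum func_id { FUNC_skb_store_bytes, FUNC_tail_call, FUNC_map_lookup_elem };

static bool changes_pkt_data(enum func_id id)
{
    switch (id) {
    case FUNC_skb_store_bytes:  /* direct packet writer */
    case FUNC_tail_call:        /* callee could be any of the above */
        return true;
    default:
        return false;
    }
}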
skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); @@ -9460,7 +9464,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the - * mono_delivery_time bit also. + * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; @@ -9470,7 +9474,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); - /* <clear>: mono_delivery_time */ + /* <clear>: skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a..f63586c9ce02 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,8 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +#include "dev.h" + DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); @@ -325,82 +327,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + local_bh_disable(); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->output)) + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + local_bh_disable(); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->xmit)) + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct 
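The lwtunnel_output()/lwtunnel_xmit() hunks above disable BHs and consult dev_xmit_recursion() so that nested lightweight-tunnel encapsulations fail with -ENETDOWN rather than recursing until the kernel stack overflows. A userspace model of the per-CPU depth limit using a thread-local counter; XMIT_RECURSION_LIMIT mirrors the kernel's value of 8:

#include <stdio.h>

#define XMIT_RECURSION_LIMIT 8 /* matches the kernel's limit */

static _Thread_local int xmit_depth;

static int lwt_output(int nested)
{
    if (xmit_depth >= XMIT_RECURSION_LIMIT) {
        fprintf(stderr, "recursion limit reached on datapath\n");
        return -1; /* the kernel returns -ENETDOWN and drops the skb */
    }

    xmit_depth++;
    if (nested)
        lwt_output(nested - 1); /* models ops->output() re-entering */
    xmit_depth--;
    return 0;
}

int main(void) { return lwt_output(16) ? 1 : 0; }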
sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } - if (!dst) + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -410,8 +462,11 @@ int lwtunnel_input(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->input)) + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); ret = ops->input(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e44feb39d459..1e2e60ffe766 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2293,6 +2293,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 31f923e7b5c4..2f2f63c8cf4b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -865,7 +865,13 @@ static void page_pool_release_retry(struct work_struct *wq) int inflight; inflight = page_pool_release(pool); - if (!inflight) + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. 
+ */ + if (inflight <= 0) return; /* Periodic warning */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 359e24c3f22c..6afea369ca21 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -897,6 +897,10 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) pkt_dev->nr_labels = 0; do { __u32 tmp; + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + len = hex32_arg(&buffer[i], 8, &tmp); if (len <= 0) return len; @@ -908,8 +912,6 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) return -EFAULT; i++; n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; @@ -1875,8 +1877,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1906,7 +1908,8 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 72360c673fc6..beac45a47683 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1011,6 +1011,9 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } + if (dev->netdev_ops->ndo_get_vf_guid) + size += num_vfs * 2 * + nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; diff --git a/net/core/scm.c b/net/core/scm.c index 737917c7ac62..431bfb3ea392 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } @@ -371,8 +382,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/core/selftests.c b/net/core/selftests.c index acb1ee97bbd3..7af99d07762e 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -100,10 +100,10 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, ehdr->h_proto = htons(ETH_P_IP); if (attr->tcp) { + memset(thdr, 0, sizeof(*thdr)); thdr->source = htons(attr->sport); thdr->dest = htons(attr->dport); thdr->doff = sizeof(struct tcphdr) / 4; - thdr->check = 0; } else { uhdr->source = htons(attr->sport); uhdr->dest = htons(attr->dport); @@ -144,10 +144,18 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, attr->id = net_test_next_id; shdr->id = net_test_next_id++; - 
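The pktgen get_labels() fix above hoists the MAX_MPLS_LABELS test to the top of the loop, rejecting the label that would overflow the array before it is parsed and stored rather than after the write has already happened. The same check-before-store shape in plain C, with strtoul() standing in for hex32_arg():

#include <stdlib.h>

#define MAX_MPLS_LABELS 16

static int get_labels(const char *s, unsigned int *labels)
{
    unsigned int n = 0;
    char *end;

    do {
        if (n >= MAX_MPLS_LABELS)
            return -1;               /* -E2BIG: refuse before the write */
        labels[n++] = strtoul(s, &end, 16);
        s = end + 1;                 /* step past the ',' separator */
    } while (*end == ',');

    return (int)n;
}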
if (attr->size) - skb_put(skb, attr->size); - if (attr->max_size && attr->max_size > skb->len) - skb_put(skb, attr->max_size - skb->len); + if (attr->size) { + void *payload = skb_put(skb, attr->size); + + memset(payload, 0, attr->size); + } + + if (attr->max_size && attr->max_size > skb->len) { + size_t pad_len = attr->max_size - skb->len; + void *pad = skb_put(skb, pad_len); + + memset(pad, 0, pad_len); + } skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index b9b941c487c8..2076db464e93 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -529,16 +529,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; @@ -555,7 +561,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; - msg->skb = skb; + msg->skb = take_ref ? skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); @@ -563,7 +569,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len); + u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) @@ -577,7 +583,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * correctly. */ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb, off, len); + return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -589,7 +595,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; @@ -600,7 +606,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * because the skb is already accounted for here. 
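net_test_get_skb() now zeroes the TCP header and any appended payload or padding, since skb_put() only extends the buffer and hands back uninitialised bytes that would otherwise leak stale heap contents into test frames on the wire. The extend-then-zero idiom, with grow() as a stand-in for skb_put():

#include <stddef.h>
#include <string.h>

struct buf { unsigned char data[2048]; size_t len; };

static void *grow(struct buf *b, size_t n)
{
    void *p = b->data + b->len; /* like skb_put(): no zeroing done */
    b->len += n;
    return p;
}

static void pad_to(struct buf *b, size_t max_size)
{
    if (max_size > b->len) {
        size_t pad_len = max_size - b->len;

        memset(grow(b, pad_len), 0, pad_len); /* scrub stale bytes */
    }
}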
*/ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len) + u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; @@ -609,7 +615,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; @@ -618,18 +624,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - int err = 0; - if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - skb_get(skb); - err = sk_psock_skb_ingress(psock, skb, off, len); - if (err < 0) - kfree_skb(skb); - return err; + + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, @@ -654,12 +655,14 @@ static void sk_psock_backlog(struct work_struct *work) bool ingress; int ret; + /* Increment the psock refcnt to synchronize with close(fd) path in + * sock_map_close(), ensuring we wait for backlog thread completion + * before sk_socket freed. If refcnt increment fails, it indicates + * sock_map_close() completed with sk_socket potentially already freed. + */ + if (!sk_psock_get(psock->sk)) + return; mutex_lock(&psock->work_mutex); - if (unlikely(state->len)) { - len = state->len; - off = state->off; - } - while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; @@ -669,6 +672,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -696,11 +706,14 @@ static void sk_psock_backlog(struct work_struct *work) len -= ret; } while (len); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -1013,7 +1026,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ccbdb98109f8..07736edc8b6a 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -867,6 +867,16 @@ static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) kfree(dst->lags); } +static void dsa_tree_teardown_routing_table(struct dsa_switch_tree *dst) +{ + struct dsa_link *dl, *next; + + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } +} + static int dsa_tree_setup(struct dsa_switch_tree *dst) { bool complete; @@ -884,7 +894,7 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) err = dsa_tree_setup_cpu_ports(dst); if (err) - return err; + goto teardown_rtable; err = dsa_tree_setup_switches(dst); if (err) @@ -916,14 +926,14 @@ teardown_switches: dsa_tree_teardown_switches(dst); 
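sk_psock_backlog() above pins the socket with sk_psock_get() for the duration of the backlog run, so sock_map_close() cannot free sk_socket underneath the worker; if the get fails, close has already won the race and the worker returns without touching the queue. A minimal model of the pin-or-bail pattern with a C11 compare-and-swap refcount (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

struct psock { atomic_int refcnt; };

/* Succeed only while at least one reference is still live. */
static bool psock_get(struct psock *p)
{
    int old = atomic_load(&p->refcnt);

    do {
        if (old == 0)
            return false; /* teardown already ran: do no work */
    } while (!atomic_compare_exchange_weak(&p->refcnt, &old, old + 1));
    return true;
}

static void backlog_work(struct psock *p)
{
    if (!psock_get(p))
        return;
    /* ... drain queued skbs safely here ... */
    atomic_fetch_sub(&p->refcnt, 1); /* models sk_psock_put() */
}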
teardown_cpu_ports: dsa_tree_teardown_cpu_ports(dst); +teardown_rtable: + dsa_tree_teardown_routing_table(dst); return err; } static void dsa_tree_teardown(struct dsa_switch_tree *dst) { - struct dsa_link *dl, *next; - if (!dst->setup) return; @@ -937,10 +947,7 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_cpu_ports(dst); - list_for_each_entry_safe(dl, next, &dst->rtable, list) { - list_del(&dl->list); - kfree(dl); - } + dsa_tree_teardown_routing_table(dst); pr_info("DSA: tree %d torn down\n", dst->index); @@ -1483,12 +1490,44 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) static void dsa_switch_release_ports(struct dsa_switch *ds) { + struct dsa_mac_addr *a, *tmp; struct dsa_port *dp, *next; + struct dsa_vlan *v, *n; dsa_switch_for_each_port_safe(dp, next, ds) { - WARN_ON(!list_empty(&dp->fdbs)); - WARN_ON(!list_empty(&dp->mdbs)); - WARN_ON(!list_empty(&dp->vlans)); + /* These are either entries that upper layers lost track of + * (probably due to bugs), or installed through interfaces + * where one does not necessarily have to remove them, like + * ndo_dflt_fdb_add(). + */ + list_for_each_entry_safe(a, tmp, &dp->fdbs, list) { + dev_info(ds->dev, + "Cleaning up unicast address %pM vid %u from port %d\n", + a->addr, a->vid, dp->index); + list_del(&a->list); + kfree(a); + } + + list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { + dev_info(ds->dev, + "Cleaning up multicast address %pM vid %u from port %d\n", + a->addr, a->vid, dp->index); + list_del(&a->list); + kfree(a); + } + + /* These are entries that upper layers have lost track of, + * probably due to bugs, but also due to dsa_port_do_vlan_del() + * having failed and the VLAN entry still lingering on. + */ + list_for_each_entry_safe(v, n, &dp->vlans, list) { + dev_info(ds->dev, + "Cleaning up vid %u from port %d\n", + v->vid, dp->index); + list_del(&v->list); + kfree(v); + } + list_del(&dp->list); kfree(dp); } diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index cbdfc392f7e0..a5420421e462 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -197,7 +197,7 @@ static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid) err = ds->ops->tag_8021q_vlan_del(ds, port, vid); if (err) { - refcount_inc(&v->refcount); + refcount_set(&v->refcount, 1); return err; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index cacdafb41200..146c1dbd15a9 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -257,7 +257,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, int source_port; u8 *brcm_tag; - if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) + if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) return NULL; brcm_tag = dsa_etype_header_pos_rx(skb); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index c1ad63bee8ea..7a9d8fe78ae9 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -402,7 +402,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ops->prepare_data(req_info, reply_data, info); rtnl_unlock(); if (ret < 0) - goto err_cleanup; + goto err_dev; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; @@ -460,7 +460,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); rtnl_unlock(); if (ret < 0) - goto out; + goto out_cancel; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; @@ -469,6 +469,7 @@ static 
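dsa_switch_release_ports() above replaces the WARN_ON()s with _safe list walks that log and free whatever FDB/MDB/VLAN entries upper layers leaked. The _safe variant matters because the loop frees the node it stands on; a userspace equivalent over a singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct entry { int vid; struct entry *next; };

static void release_all(struct entry **head)
{
    struct entry *e = *head, *next;

    while (e) {
        next = e->next;  /* grab the successor before freeing */
        printf("Cleaning up vid %d\n", e->vid);
        free(e);
        e = next;
    }
    *head = NULL;
}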
int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); +out_cancel: ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); @@ -676,7 +677,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) - goto err_cleanup; + goto err_rep; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; @@ -711,6 +712,7 @@ err_skb: err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); +err_rep: kfree(reply_data); kfree(req_info); return; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6dd960ec558c..ba0455ad7701 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -130,7 +130,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, goto err; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index eeace9b509ce..49fd664f50fc 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -118,47 +118,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) } #ifdef CONFIG_INET_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, @@ -171,20 +140,6 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -207,6 +162,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -391,6 +348,8 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 90ce87ffed46..7993ff46de23 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ 
-829,19 +829,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 513f475c6a53..298a9944a3d1 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -222,9 +222,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + struct fib4_rule *rule4 = (struct fib4_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 77b97c48da5e..fa54b36b241a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1192,22 +1192,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1228,9 +1212,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1723,9 +1704,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c88c9034d630..496308c02384 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; - head->mono_delivery_time = q->mono_delivery_time; + head->tstamp_type = q->tstamp_type; if (sk) refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7967ff7e02f7..60e81f6b1c6d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1231,22 +1231,37 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; + spinlock_t *ptr = NULL; - if (locksz != 0) { - /* allocate 2 cache lines or at least one spinlock per cpu */ - nblocks = max(2U * L1_CACHE_BYTES / locksz, 
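The IPv4 prefix validation moves from fib_table_insert()/fib_table_delete() up into rtm_to_fib_config(), so malformed routes are rejected once at netlink parse time. The host-bits test shifts the host-order destination left by the prefix length; any nonzero result means bits are set beyond the prefix. A self-contained check (the len < 32 guard also avoids an undefined full-width shift):

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool prefix_valid(uint32_t dst_be, unsigned int len)
{
    if (len > 32)
        return false;                     /* "Invalid prefix length" */
    if (len < 32 && (ntohl(dst_be) << len))
        return false;                     /* host bits set past prefix */
    return true;
}

int main(void)
{
    printf("%d\n", prefix_valid(htonl(0xC0A80100), 24)); /* .1.0/24 -> 1 */
    printf("%d\n", prefix_valid(htonl(0xC0A80101), 24)); /* .1.1/24 -> 0 */
    return 0;
}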
1U); - nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + if (locksz == 0) + goto set_mask; - /* no more locks than number of hash buckets */ - nblocks = min(nblocks, hashinfo->ehash_mask + 1); + /* Allocate 2 cache lines or at least one spinlock per cpu. */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); - hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); - if (!hashinfo->ehash_locks) - return -ENOMEM; + /* At least one page per NUMA node. */ + nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); + + nblocks = roundup_pow_of_two(nblocks); + + /* No more locks than number of hash buckets. */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); - for (i = 0; i < nblocks; i++) - spin_lock_init(&hashinfo->ehash_locks[i]); + if (num_online_nodes() > 1) { + /* Use vmalloc() to allow NUMA policy to spread pages + * on all available nodes if desired. + */ + ptr = vmalloc_array(nblocks, locksz); + } + if (!ptr) { + ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); + if (!ptr) + return -ENOMEM; } + for (i = 0; i < nblocks; i++) + spin_lock_init(&ptr[i]); + hashinfo->ehash_locks = ptr; +set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 877d1e03150c..484edc8513e4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -360,7 +360,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; - qp->q.mono_delivery_time = skb->mono_delivery_time; + qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 890c15510b42..f261e29adc7c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -140,7 +140,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -181,7 +180,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -189,10 +187,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 765bd3f2a840..b8cfe6afc84b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -764,7 +764,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -856,7 +856,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, 
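inet_ehash_locks_alloc() above now sizes the lock array to at least one page per NUMA node and, on multi-node machines, tries vmalloc_array() first so NUMA policy can spread the pages across nodes. The sizing arithmetic in isolation, with illustrative constants standing in for num_possible_cpus()/num_online_nodes():

#include <stdio.h>

#define L1_CACHE_BYTES 64
#define PAGE_SIZE 4096

static unsigned int roundup_pow_of_two(unsigned int n)
{
    unsigned int r = 1;

    while (r < n)
        r <<= 1;
    return r;
}

int main(void)
{
    unsigned int locksz = 4;   /* sizeof(spinlock_t) without lockdep */
    unsigned int cpus = 16, nodes = 2, ehash_mask = 65535;
    unsigned int nblocks;

    /* Two cache lines of locks (or at least one lock) per cpu. */
    nblocks = (2U * L1_CACHE_BYTES / locksz) * cpus;
    /* At least one page of locks per NUMA node. */
    if (nblocks < nodes * PAGE_SIZE / locksz)
        nblocks = nodes * PAGE_SIZE / locksz;
    nblocks = roundup_pow_of_two(nblocks);
    /* Never more locks than hash buckets. */
    if (nblocks > ehash_mask + 1)
        nblocks = ehash_mask + 1;
    printf("nblocks=%u\n", nblocks);
    return 0;
}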
tstamp_type); err = output(net, sk, skb); if (!err) @@ -912,7 +912,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; @@ -1648,7 +1648,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - nskb->mono_delivery_time = !!transmit_time; + if (transmit_time) + nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 80ccd6661aa3..deb08cab4464 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -415,7 +415,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, skb_dst_update_pmtu_no_confirm(skb, mtu); - if (!reply || skb->pkt_type == PACKET_HOST) + if (!reply) return 0; if (skb->protocol == htons(ETH_P_IP)) @@ -450,7 +450,7 @@ static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, - [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static const struct nla_policy diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 10d38ec0ff5a..a172248b6678 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -425,6 +425,20 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr return false; } +static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) +{ + tp->delivered_ce += ecn_count; +} + +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tcp_count_delivered_ce(tp, delivered); +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -1137,15 +1151,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -/* Updates the delivered and delivered_ce counts */ -static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, - bool ece_ack) -{ - tp->delivered += delivered; - if (ece_ack) - tp->delivered_ce += delivered; -} - /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -3816,12 +3821,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } -static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->in_ack_event) - icsk->icsk_ca_ops->in_ack_event(sk, flags); + if (icsk->icsk_ca_ops->in_ack_event) { + u32 ack_ev_flags = 0; + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_SLOWPATH) { + ack_ev_flags |= CA_ACK_SLOWPATH; + if (flag & FLAG_ECE) + ack_ev_flags |= CA_ACK_ECE; + } + + icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); + } } /* Congestion control has updated the cwnd already. 
So if we're in @@ -3938,12 +3954,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3955,19 +3967,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); - - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; - - tcp_in_ack_event(sk, ack_ev_flags); } /* This is a deviation from RFC3168 since it states that: @@ -3994,6 +3999,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + tcp_in_ack_event(sk, flag); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4025,6 +4032,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3771ed22c2f5..560273e7f773 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1266,7 +1266,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; @@ -1607,7 +1607,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - skb_set_delivery_time(buff, skb->tstamp, true); + skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -2703,7 +2703,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -3688,11 +3688,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), - true); + SKB_CLOCK_MONOTONIC); else #endif { - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3741,7 +3741,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); - 
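tcp_ack() above computes the FLAG_* word first and hands it to tcp_in_ack_event(), which derives the CA_ACK_* bits itself; this keeps the congestion-control callback consistent across the fast path, the slow path, and the previously missed no_queue path. The translation is a pure bit mapping (the FLAG_* values mirror tcp_input.c, the CA_ACK_* values are illustrative):

#include <stdint.h>

#define FLAG_WIN_UPDATE 0x02
#define FLAG_ECE        0x40
#define FLAG_SLOWPATH   0x100

#define CA_ACK_SLOWPATH   0x1
#define CA_ACK_WIN_UPDATE 0x2
#define CA_ACK_ECE        0x4

static uint32_t ack_ev_flags(int flag)
{
    uint32_t ev = 0;

    if (flag & FLAG_WIN_UPDATE)
        ev |= CA_ACK_WIN_UPDATE;
    if (flag & FLAG_SLOWPATH) {
        ev |= CA_ACK_SLOWPATH;
        if (flag & FLAG_ECE)    /* ECE reported only on the slow path */
            ev |= CA_ACK_ECE;
    }
    return ev;
}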
skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; @@ -3923,7 +3923,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b84d18fcd9e2..dc91699ce032 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1412,12 +1412,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb) } /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial, - bool rx_queue_lock_held) +static void udp_rmem_release(struct sock *sk, unsigned int size, + int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; - int amt; + unsigned int amt; if (likely(partial)) { up->forward_deficit += size; @@ -1437,10 +1437,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); - - sk_forward_alloc_add(sk, size); - amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk_forward_alloc_add(sk, -amt); + amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); + sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1630,7 +1628,7 @@ EXPORT_SYMBOL_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, - int *total) + unsigned int *total) { struct sk_buff *skb; @@ -1663,8 +1661,8 @@ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + unsigned int total = 0; struct sk_buff *skb; - int total = 0; int res; spin_lock_bh(&rcvq->lock); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2ab16139c197..3870b59f5400 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -247,6 +247,62 @@ static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs) return segs; } +static void __udpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, + __be16 *oldport, __be16 newport) +{ + struct udphdr *uh = udp_hdr(seg); + + if (ipv6_addr_equal(oldip, newip) && *oldport == newport) + return; + + if (uh->check) { + inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32, + newip->s6_addr32, true); + + inet_proto_csum_replace2(&uh->check, seg, *oldport, newport, + false); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + + *oldip = *newip; + *oldport = newport; +} + +static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct ipv6hdr *iph; + const struct udphdr *uh; + struct ipv6hdr *iph2; + struct sk_buff *seg; + struct udphdr *uh2; + + seg = segs; + uh = udp_hdr(seg); + iph = ipv6_hdr(seg); + uh2 = udp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + uh2 = udp_hdr(seg); + iph2 = ipv6_hdr(seg); + + __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &uh2->source, 
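udp_rmem_release() above folds the freshly released size into the page-rounding step: amt becomes the page-aligned part of (size + forward_alloc - partial) and forward_alloc keeps only the sub-page remainder, saving one add/sub pair on sk_forward_alloc. The arithmetic checked standalone:

#include <stdio.h>

#define PAGE_SIZE 4096u

int main(void)
{
    unsigned int fwd_alloc = 1000, size = 9000, partial = 1;
    unsigned int amt;

    amt = (size + fwd_alloc - partial) & ~(PAGE_SIZE - 1);
    fwd_alloc += size - amt;   /* keep the unaligned remainder only */

    /* amt (8192 here) is returned to memory accounting in whole
     * pages; fwd_alloc keeps 1808 bytes for future packets. */
    printf("amt=%u fwd_alloc=%u\n", amt, fwd_alloc);
    return 0;
}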
uh->source); + __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &uh2->dest, uh->dest); + } + + return segs; +} + static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) @@ -259,7 +315,10 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); - return is_ipv6 ? skb : __udpv4_gso_segment_list_csum(skb); + if (is_ipv6) + return __udpv6_gso_segment_list_csum(skb); + else + return __udpv4_gso_segment_list_csum(skb); } struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, @@ -273,6 +332,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) @@ -295,6 +355,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index db07d3bbaf37..231fa4dc6cde 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5753,6 +5753,27 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } +static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb, + struct inet6_dev *idev) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, u32 ext_filter_mask) { @@ -5774,18 +5795,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, /* XXX - MC not implemented */ - if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) - return 0; - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) { + if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0) + goto nla_put_failure; + } nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 1578ed9e97d8..c07e3da08d2a 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1075,8 +1075,13 @@ static int calipso_sock_getattr(struct sock *sk, struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; @@ -1128,8 
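__udpv6_gso_segment_csum() above patches each trailing segment's addresses and ports with incremental checksum updates, then maps a resulting zero checksum to CSUM_MANGLED_0: a transmitted zero would claim "no checksum", which UDP over IPv6 forbids, while 0xffff encodes the same sum in ones'-complement arithmetic. The final fold as a one-liner:

#include <stdint.h>

#define CSUM_MANGLED_0 ((uint16_t)0xffff)

static uint16_t fixup_udp6_csum(uint16_t check)
{
    /* 0 means "not computed" on the wire; 0xffff is the
     * equivalent ones'-complement encoding of the same sum. */
    return check ? check : CSUM_MANGLED_0;
}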
+1133,13 @@ static int calipso_sock_setattr(struct sock *sk, { int ret_val; struct ipv6_opt_hdr *old, *new; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); old = NULL; if (txopts) old = txopts->hopopt; @@ -1156,8 +1166,13 @@ static int calipso_sock_setattr(struct sock *sk, static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 62bb9651133c..7e4c8628cf98 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -135,47 +135,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) } #ifdef CONFIG_INET6_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, @@ -188,20 +157,6 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -224,6 +179,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -427,6 +384,8 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 6eeab21512ba..e0f0c5f8cccd 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -350,9 +350,9 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlattr **tb, struct netlink_ext_ack *extack) { + struct fib6_rule *rule6 = (struct fib6_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct net *net = sock_net(skb->sk); - struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, diff --git a/net/ipv6/ip6_output.c 
b/net/ipv6/ip6_output.c index cd89a2b35dfb..28777b142240 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -864,7 +864,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -958,7 +958,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -1019,7 +1019,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(frag, tstamp, mono_delivery_time); + skb_set_delivery_time(frag, tstamp, tstamp_type); err = output(net, sk, frag); if (err) goto fail; @@ -1452,6 +1452,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1485,7 +1486,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1541,7 +1542,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1913,7 +1914,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -2118,7 +2119,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, &queue, cork, &v6_cork, &current->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 857713d7a38a..606aae4e78a9 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -126,7 +126,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -163,20 +163,20 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct ip6_fraglist_iter iter; struct sk_buff *frag2; - if (first_len - hlen > mtu || - skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb))
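A note on the br_ip6_fragment() hunk in flight here: an over-MTU first fragment can never be made to fit, so it still goes to blackhole, whereas a cloned skb or insufficient headroom only disqualifies the zero-copy fraglist fast path. The slow path copies the payload into freshly allocated skbs and can therefore always provide room for the frag_hdr. A minimal sketch of the resulting triage, using the same names as the diff:

    if (first_len - hlen > mtu)
        goto blackhole;   /* unfixable: fragment would exceed the MTU */
    if (skb_cloned(skb) ||
        skb_headroom(skb) < hroom + sizeof(struct frag_hdr))
        goto slow_path;   /* recoverable: slow path reallocates headroom */
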
+ if (skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag2) { - if (frag2->len > mtu || - skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + if (frag2->len > mtu) goto blackhole; /* Partially cloned skb? */ - if (skb_shared(frag2)) + if (skb_shared(frag2) || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path; } @@ -192,7 +192,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -225,7 +225,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c78b13ea5b19..82e51b2ec4f5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -268,7 +268,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index a7690ec62325..9ea5ef56cb27 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -103,6 +103,10 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn const *ct; +#endif tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { @@ -136,6 +140,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, return NULL; } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && + ((tproto != IPPROTO_ICMPV6 && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (tproto == IPPROTO_ICMPV6 && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + dport = (tproto == IPPROTO_TCP) ? 
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index c9f1634b3838..a89ce0fbfe4b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -158,6 +158,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); + const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = &regs->data[priv->dreg]; struct ipv6hdr *iph, _iph; @@ -202,11 +203,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev && - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) - goto put_rt_err; + if (!oif) { + found = rt->rt6i_idev->dev; + } else { + if (oif == rt->rt6i_idev->dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + found = oif; + } - nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); + nft_fib_store_result(dest, priv, found); put_rt_err: ip6_rt_put(rt); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2af98edef87e..cb219d4bdf25 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -198,7 +198,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5715d54f3d0b..53197087353a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -414,12 +414,37 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } +static struct fib6_info * +rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) +{ + struct fib6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + iter = rcu_dereference(fn->leaf); + if (!iter) + goto out; + + while (iter) { + if (iter->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference(iter->fib6_next); + } + +out: + return NULL; +} + void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *match = res->f6i; + struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; + int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -442,16 +467,25 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, return; } - if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) + first = rt6_multipath_first_sibling_rcu(match); + if (!first) goto out; - list_for_each_entry_rcu(sibling, &match->fib6_siblings, + hash = fl6->mp_hash; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { + if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) + match = first; + goto out; + } + + list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound =
atomic_read(&nh->fib_nh_upper_bound); - if (fl6->mp_hash > nh_upper_bound) + if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; @@ -3636,7 +3670,8 @@ out: in6_dev_put(idev); if (err) { - lwtstate_put(fib6_nh->fib_nh_lws); + fib_nh_common_release(&fib6_nh->nh_common); + fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } @@ -3796,10 +3831,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (nh) { if (rt->fib6_src.plen) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + err = -EINVAL; goto out_free; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -ENOENT; goto out_free; } rt->nh = nh; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c434940131b1..7f295b9c1374 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1638,10 +1638,8 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 }, - [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)), + [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f285e52b8b85..624ab1424eba 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -934,7 +934,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); - skb_set_delivery_time(buff, tcp_transmit_time(sk), true); + skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cc25fec44f85..19f3de3c24ef 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -888,15 +888,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; + /* Partial read */ + if (used + offset < skb_len) + continue; + if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } - - /* Partial read */ - if (used + offset < skb_len) - continue; } while (len > 0); out: diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index fae701248f05..eaa4e5c6a5c3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -682,6 +682,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ieee80211_txq_remove_vlan(local, sdata); + if (sdata->vif.txq) + ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq)); + sdata->bss = NULL; if (local->open_count == 0) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d1046f495e63..3a6fff98748b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1186,10 +1186,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return -EINVAL; } - local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) + - sizeof(void *) * channels, GFP_KERNEL); + local->int_scan_req = kzalloc(struct_size(local->int_scan_req, + channels, 
channels), + GFP_KERNEL); if (!local->int_scan_req) return -ENOMEM; + local->int_scan_req->n_channels = channels; eth_broadcast_addr(local->int_scan_req->bssid); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 51369072984e..c6395551f5df 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -365,6 +365,12 @@ u32 airtime_link_metric_get(struct ieee80211_local *local, return (u32)result; } +/* Check that the first metric is at least 10% better than the second one */ +static bool is_metric_better(u32 x, u32 y) +{ + return (x < y) && (x < (y - x / 10)); +} + /** * hwmp_route_info_get - Update routing info to originator and transmitter * @@ -456,8 +462,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, (mpath->sn == orig_sn && (rcu_access_pointer(mpath->next_hop) != sta ? - mult_frac(new_metric, 10, 9) : - new_metric) >= mpath->metric)) { + !is_metric_better(new_metric, mpath->metric) : + new_metric >= mpath->metric))) { process = false; fresh_info = false; } @@ -531,8 +537,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, if ((mpath->flags & MESH_PATH_FIXED) || ((mpath->flags & MESH_PATH_ACTIVE) && ((rcu_access_pointer(mpath->next_hop) != sta ? - mult_frac(last_hop_metric, 10, 9) : - last_hop_metric) > mpath->metric))) + !is_metric_better(last_hop_metric, mpath->metric) : + last_hop_metric > mpath->metric)))) fresh_info = false; } else { mpath = mesh_path_add(sdata, ta); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 42e2c84ed248..2c7e139efd53 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2959,7 +2959,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, if (tx) ieee80211_flush_queues(local, sdata, false); - drv_mgd_complete_tx(sdata->local, sdata, &info); + if (tx || frame_buf) + drv_mgd_complete_tx(sdata->local, sdata, &info); /* clear AP addr only after building the needed mgmt frames */ eth_zero_addr(sdata->deflink.u.mgd.bssid); @@ -7821,7 +7822,6 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); - drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 5d71e8d084c4..64cf5589989b 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/module.h> @@ -1321,9 +1321,13 @@ static int _sta_info_move_state(struct sta_info *sta, sta->sta.addr, new_state); /* notify the driver before the actual changes so it can - * fail the transition + * fail the transition if the state is increasing. + * The driver is required not to fail when the transition + * is decreasing the state, so first, do all the preparation + * work and only then, notify the driver. 
*/ - if (test_sta_flag(sta, WLAN_STA_INSERTED)) { + if (new_state > sta->sta_state && + test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) @@ -1399,6 +1403,16 @@ static int _sta_info_move_state(struct sta_info *sta, break; } + if (new_state < sta->sta_state && + test_sta_flag(sta, WLAN_STA_INSERTED)) { + int err = drv_sta_state(sta->local, sta->sdata, sta, + sta->sta_state, new_state); + + WARN_ONCE(err, + "Driver is not allowed to fail if the sta_state is transitioning down the list: %d\n", + err); + } + sta->sta_state = new_state; return 0; diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 28be85d05533..8032cfba22d1 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -550,6 +550,9 @@ static int mctp_sk_hash(struct sock *sk) { struct net *net = sock_net(sk); + /* Bind lookup runs under RCU, remain live during that. */ + sock_set_flag(sk, SOCK_RCU_FREE); + mutex_lock(&net->mctp.bind_lock); sk_add_node_rcu(sk, &net->mctp.binds); mutex_unlock(&net->mctp.bind_lock); diff --git a/net/mctp/device.c b/net/mctp/device.c index 85cc5f31f1e7..8d1386601bbe 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -20,8 +20,7 @@ #include <net/sock.h> struct mctp_dump_cb { - int h; - int idx; + unsigned long ifindex; size_t a_idx; }; @@ -115,43 +114,36 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct mctp_dump_cb *mcb = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct hlist_head *head; struct net_device *dev; struct ifaddrmsg *hdr; struct mctp_dev *mdev; - int ifindex; - int idx = 0, rc; - - hdr = nlmsg_data(cb->nlh); - // filter by ifindex if requested - ifindex = hdr->ifa_index; + int ifindex = 0, rc; + + /* Filter by ifindex if a header is provided */ + if (cb->nlh->nlmsg_len >= nlmsg_msg_size(sizeof(*hdr))) { + hdr = nlmsg_data(cb->nlh); + ifindex = hdr->ifa_index; + } else { + if (cb->strict_check) { + NL_SET_ERR_MSG(cb->extack, "mctp: Invalid header for addr dump request"); + return -EINVAL; + } + } rcu_read_lock(); - for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { - idx = 0; - head = &net->dev_index_head[mcb->h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx >= mcb->idx && - (ifindex == 0 || ifindex == dev->ifindex)) { - mdev = __mctp_dev_get(dev); - if (mdev) { - rc = mctp_dump_dev_addrinfo(mdev, - skb, cb); - mctp_dev_put(mdev); - // Error indicates full buffer, this - // callback will get retried. 
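For context on the mctp_dump_addrinfo() rewrite in this hunk: for_each_netdev_dump() iterates devices in ifindex order and keeps its resume point in an unsigned long cursor that lives in cb->ctx, so it survives between dump callbacks; that replaces the hand-rolled hash-bucket (h) and per-bucket (idx) bookkeeping being deleted above. A minimal sketch of the pattern, with fill_one() as a hypothetical stand-in for the per-device fill:

    struct example_dump_ctx {
        unsigned long ifindex;  /* resume cursor, persisted in cb->ctx */
    };

    rcu_read_lock();
    for_each_netdev_dump(net, dev, ctx->ifindex) {
        if (fill_one(skb, dev) < 0)
            break;  /* skb full; the next callback resumes at ctx->ifindex */
    }
    rcu_read_unlock();
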
- if (rc < 0) - goto out; - } - } - idx++; - // reset for next iteration - mcb->a_idx = 0; - } + for_each_netdev_dump(net, dev, mcb->ifindex) { + if (ifindex && ifindex != dev->ifindex) + continue; + mdev = __mctp_dev_get(dev); + if (!mdev) + continue; + rc = mctp_dump_dev_addrinfo(mdev, skb, cb); + mctp_dev_put(mdev); + if (rc < 0) + break; + mcb->a_idx = 0; } -out: rcu_read_unlock(); - mcb->idx = idx; return skb->len; } @@ -525,9 +517,12 @@ static struct notifier_block mctp_dev_nb = { }; static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = { - {THIS_MODULE, PF_MCTP, RTM_NEWADDR, mctp_rtm_newaddr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_DELADDR, mctp_rtm_deladdr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_GETADDR, NULL, mctp_dump_addrinfo, 0}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_NEWADDR, + .doit = mctp_rtm_newaddr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_DELADDR, + .doit = mctp_rtm_deladdr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_GETADDR, + .dumpit = mctp_dump_addrinfo}, }; int __init mctp_device_init(void) diff --git a/net/mctp/route.c b/net/mctp/route.c index d3c1f54386ef..009ba5edbd52 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -274,8 +274,10 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) key = flow->key; - if (WARN_ON(key->dev && key->dev != dev)) + if (key->dev) { + WARN_ON(key->dev != dev); return; + } mctp_dev_set_key(dev, key); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 838c154b2b90..6ef98608ac64 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -651,6 +651,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; + struct mptcp_addr_info addr; bool echo; int len; @@ -659,7 +660,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; @@ -672,7 +673,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * else if (opts->suboptions & OPTION_MPTCP_DSS) return false; - len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); + len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; @@ -689,6 +690,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = 0; *size -= opt_size; } + opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d0f73b9180c7..31f6899ef71a 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1388,6 +1388,32 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos); + case IP_FREEBIND: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(FREEBIND, sk)); + case IP_TRANSPARENT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(TRANSPARENT, sk)); + } + + return -EOPNOTSUPP; +} + +static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, + 
char __user *optval, int __user *optlen) +{ + struct sock *sk = (void *)msk; + + switch (optname) { + case IPV6_V6ONLY: + return mptcp_put_int_option(msk, optval, optlen, + sk->sk_ipv6only); + case IPV6_TRANSPARENT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(TRANSPARENT, sk)); + case IPV6_FREEBIND: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(FREEBIND, sk)); } return -EOPNOTSUPP; @@ -1432,6 +1458,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_IP) return mptcp_getsockopt_v4(msk, optname, optval, option); + if (level == SOL_IPV6) + return mptcp_getsockopt_v6(msk, optname, optval, option); if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); if (level == SOL_MPTCP) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index b3eeeb948b61..f4b8ca8be81e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -731,8 +731,6 @@ static bool subflow_hmac_valid(const struct request_sock *req, subflow_req = mptcp_subflow_rsk(req); msk = subflow_req->msk; - if (!msk) - return false; subflow_generate_hmac(msk->remote_key, msk->local_key, subflow_req->remote_nonce, @@ -828,12 +826,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || - !subflow_hmac_valid(req, &mp_opt) || - !mptcp_can_accept_new_subflow(subflow_req->msk)) { - SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK)) fallback = true; - } } create_child: @@ -883,6 +877,17 @@ create_child: goto dispose_child; } + if (!subflow_hmac_valid(req, &mp_opt)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + goto dispose_child; + } + + if (!mptcp_can_accept_new_subflow(owner)) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + goto dispose_child; + } + /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 4e0842df5234..2c260f33b55c 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -143,16 +143,15 @@ struct ncsi_channel_vlan_filter { }; struct ncsi_channel_stats { - u32 hnc_cnt_hi; /* Counter cleared */ - u32 hnc_cnt_lo; /* Counter cleared */ - u32 hnc_rx_bytes; /* Rx bytes */ - u32 hnc_tx_bytes; /* Tx bytes */ - u32 hnc_rx_uc_pkts; /* Rx UC packets */ - u32 hnc_rx_mc_pkts; /* Rx MC packets */ - u32 hnc_rx_bc_pkts; /* Rx BC packets */ - u32 hnc_tx_uc_pkts; /* Tx UC packets */ - u32 hnc_tx_mc_pkts; /* Tx MC packets */ - u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u64 hnc_cnt; /* Counter cleared */ + u64 hnc_rx_bytes; /* Rx bytes */ + u64 hnc_tx_bytes; /* Tx bytes */ + u64 hnc_rx_uc_pkts; /* Rx UC packets */ + u64 hnc_rx_mc_pkts; /* Rx MC packets */ + u64 hnc_rx_bc_pkts; /* Rx BC packets */ + u64 hnc_tx_uc_pkts; /* Tx UC packets */ + u64 hnc_tx_mc_pkts; /* Tx MC packets */ + u64 hnc_tx_bc_pkts; /* Tx BC packets */ u32 hnc_fcs_err; /* FCS errors */ u32 hnc_align_err; /* Alignment errors */ u32 hnc_false_carrier; /* False carrier detection */ @@ -181,7 +180,7 @@ struct ncsi_channel_stats { u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ - u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u64 hnc_rx_valid_bytes; /* Rx valid bytes */ u32 
hnc_rx_runt_pkts; /* Rx error runt packets */ u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ u32 ncsi_rx_cmds; /* Rx NCSI commands */ diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index f2f3b5c1b941..24edb2737972 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -252,16 +252,15 @@ struct ncsi_rsp_gp_pkt { /* Get Controller Packet Statistics */ struct ncsi_rsp_gcps_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ - __be32 cnt_hi; /* Counter cleared */ - __be32 cnt_lo; /* Counter cleared */ - __be32 rx_bytes; /* Rx bytes */ - __be32 tx_bytes; /* Tx bytes */ - __be32 rx_uc_pkts; /* Rx UC packets */ - __be32 rx_mc_pkts; /* Rx MC packets */ - __be32 rx_bc_pkts; /* Rx BC packets */ - __be32 tx_uc_pkts; /* Tx UC packets */ - __be32 tx_mc_pkts; /* Tx MC packets */ - __be32 tx_bc_pkts; /* Tx BC packets */ + __be64 cnt; /* Counter cleared */ + __be64 rx_bytes; /* Rx bytes */ + __be64 tx_bytes; /* Tx bytes */ + __be64 rx_uc_pkts; /* Rx UC packets */ + __be64 rx_mc_pkts; /* Rx MC packets */ + __be64 rx_bc_pkts; /* Rx BC packets */ + __be64 tx_uc_pkts; /* Tx UC packets */ + __be64 tx_mc_pkts; /* Tx MC packets */ + __be64 tx_bc_pkts; /* Tx BC packets */ __be32 fcs_err; /* FCS errors */ __be32 align_err; /* Alignment errors */ __be32 false_carrier; /* False carrier detection */ @@ -290,11 +289,11 @@ struct ncsi_rsp_gcps_pkt { __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ - __be32 rx_valid_bytes; /* Rx valid bytes */ + __be64 rx_valid_bytes; /* Rx valid bytes */ __be32 rx_runt_pkts; /* Rx error runt packets */ __be32 rx_jabber_pkts; /* Rx error jabber packets */ __be32 checksum; /* Checksum */ -}; +} __packed __aligned(4); /* Get NCSI Statistics */ struct ncsi_rsp_gns_pkt { diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 4a8ce2949fae..8668888c5a2f 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -926,16 +926,15 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) /* Update HNC's statistics */ ncs = &nc->stats; - ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); - ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); - ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); - ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); - ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); - ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); - ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); - ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); - ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); - ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_cnt = be64_to_cpu(rsp->cnt); + ncs->hnc_rx_bytes = be64_to_cpu(rsp->rx_bytes); + ncs->hnc_tx_bytes = be64_to_cpu(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = be64_to_cpu(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = be64_to_cpu(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = be64_to_cpu(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = be64_to_cpu(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = be64_to_cpu(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = be64_to_cpu(rsp->tx_bc_pkts); ncs->hnc_fcs_err = ntohl(rsp->fcs_err); ncs->hnc_align_err = ntohl(rsp->align_err); ncs->hnc_false_carrier = ntohl(rsp->false_carrier); @@ -964,7 +963,7 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames); ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames); ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames); - ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes); + ncs->hnc_rx_valid_bytes = be64_to_cpu(rsp->rx_valid_bytes); ncs->hnc_rx_runt_pkts = 
ntohl(rsp->rx_runt_pkts); ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 20aad81fcad7..c2d88b1b06b8 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -63,7 +63,7 @@ struct hbucket { #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) #define ahash_region(n, htable_bits) \ - ((n) % ahash_numof_locks(htable_bits)) + ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ : (h) * jhash_size(HTABLE_REGION_BITS)) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 5cd511162bc0..0103c4a4d10a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -119,13 +119,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) return false; } -/* Get route to daddr, update *saddr, optionally bind route to saddr */ +/* Get route to daddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - int rt_mode, __be32 *saddr) + int rt_mode, __be32 *ret_saddr) { struct flowi4 fl4; struct rtable *rt; - bool loop = false; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -135,23 +134,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, retry: rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { - /* Invalid saddr ? */ - if (PTR_ERR(rt) == -EINVAL && *saddr && - rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { - *saddr = 0; - flowi4_update_output(&fl4, 0, daddr, 0); - goto retry; - } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); return NULL; - } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + } + if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); - *saddr = fl4.saddr; flowi4_update_output(&fl4, 0, daddr, fl4.saddr); - loop = true; + rt_mode = 0; goto retry; } - *saddr = fl4.saddr; + if (ret_saddr) + *ret_saddr = fl4.saddr; return rt; } @@ -344,19 +337,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { - __be32 saddr = htonl(INADDR_ANY); - noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = do_output_route4(net, daddr, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, ret_saddr); if (!rt) goto err_unreach; - if (ret_saddr) - *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 
1 : 0; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 559665467b04..1d5de1d9f008 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -621,7 +621,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", @@ -657,7 +659,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", @@ -951,7 +955,9 @@ static struct ctl_table nf_ct_netfilter_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { } }; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ccca6e3848bc..9df883d79acc 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -248,7 +248,7 @@ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { - static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; @@ -287,8 +287,14 @@ nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); - if (unlikely(!thash)) /* clashing entry went away */ - return false; + if (unlikely(!thash)) { + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } ct = nf_ct_tuplehash_to_ctrack(thash); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9e9544f81942..18ae39cf4188 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2669,11 +2669,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, err = nft_netdev_register_hooks(ctx->net, &hook.list); if (err < 0) goto err_hooks; + + unregister = true; } } - unregister = true; - if (nla[NFTA_CHAIN_COUNTERS]) { if (!nft_is_base_chain(chain)) { err = -EOPNOTSUPP; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index eab0dc66bee6..cc7325329496 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -17,6 +17,11 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_counter { + u64_stats_t bytes; + u64_stats_t packets; +}; + +struct nft_counter_tot { s64 bytes; s64 packets; }; @@ -25,25 +30,24 @@ struct nft_counter_percpu_priv { struct nft_counter __percpu *counter; }; -static DEFINE_PER_CPU(seqcount_t, nft_counter_seq); +static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); static inline void 
nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); - - write_seqcount_begin(myseq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - this_cpu->bytes += pkt->skb->len; - this_cpu->packets++; + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->bytes, pkt->skb->len); + u64_stats_inc(&this_cpu->packets); + u64_stats_update_end(nft_sync); - write_seqcount_end(myseq); local_bh_enable(); } @@ -66,17 +70,16 @@ static int nft_counter_do_init(const struct nlattr * const tb[], if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); + this_cpu = raw_cpu_ptr(cpu_stats); if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + u64_stats_set(&this_cpu->packets, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]))); } if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->bytes = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + u64_stats_set(&this_cpu->bytes, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]))); } - preempt_enable(); + priv->counter = cpu_stats; return 0; } @@ -104,40 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx, } static void nft_counter_reset(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); + nft_sync = this_cpu_ptr(&nft_counter_sync); + + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, -total->packets); + u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_update_end(nft_sync); - write_seqcount_begin(myseq); - this_cpu->packets -= total->packets; - this_cpu->bytes -= total->bytes; - write_seqcount_end(myseq); local_bh_enable(); } static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { struct nft_counter *this_cpu; - const seqcount_t *myseq; u64 bytes, packets; unsigned int seq; int cpu; memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - myseq = per_cpu_ptr(&nft_counter_seq, cpu); + struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu); + this_cpu = per_cpu_ptr(priv->counter, cpu); do { - seq = read_seqcount_begin(myseq); - bytes = this_cpu->bytes; - packets = this_cpu->packets; - } while (read_seqcount_retry(myseq, seq)); + seq = u64_stats_fetch_begin(nft_sync); + bytes = u64_stats_read(&this_cpu->bytes); + packets = u64_stats_read(&this_cpu->packets); + } while (u64_stats_fetch_retry(nft_sync, seq)); total->bytes += bytes; total->packets += packets; @@ -148,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -237,7 +241,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, g struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -245,11 +249,9 @@ static int nft_counter_clone(struct 
nft_expr *dst, const struct nft_expr *src, g if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - this_cpu->packets = total.packets; - this_cpu->bytes = total.bytes; - preempt_enable(); + this_cpu = raw_cpu_ptr(cpu_stats); + u64_stats_set(&this_cpu->packets, total.packets); + u64_stats_set(&this_cpu->bytes, total.bytes); priv_clone->counter = cpu_stats; return 0; @@ -267,17 +269,17 @@ static void nft_counter_offload_stats(struct nft_expr *expr, const struct flow_stats *stats) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - write_seqcount_begin(myseq); - this_cpu->packets += stats->pkts; - this_cpu->bytes += stats->bytes; - write_seqcount_end(myseq); + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, stats->pkts); + u64_stats_add(&this_cpu->bytes, stats->bytes); + u64_stats_update_end(nft_sync); local_bh_enable(); } @@ -286,7 +288,7 @@ void nft_counter_init_seqcount(void) int cpu; for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); + u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu)); } struct nft_expr_type nft_counter_type; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 9b2d7463d3d3..df0798da2329 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -19,10 +19,16 @@ struct nft_quota { }; static inline bool nft_overquota(struct nft_quota *priv, - const struct sk_buff *skb) + const struct sk_buff *skb, + bool *report) { - return atomic64_add_return(skb->len, priv->consumed) >= - atomic64_read(&priv->quota); + u64 consumed = atomic64_add_return(skb->len, priv->consumed); + u64 quota = atomic64_read(&priv->quota); + + if (report) + *report = consumed >= quota; + + return consumed > quota; } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) + if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); - bool overquota; + bool overquota, report; - overquota = nft_overquota(priv, pkt->skb); + overquota = nft_overquota(priv, pkt->skb, &report); if (overquota ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; - if (overquota && + if (report && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 0370f69dce86..2f1012bde1f3 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -308,7 +308,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, nft_setelem_expr_foreach(expr, elem_expr, size) { if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) + expr->ops->gc(read_pnet(&set->net), expr) && + set->flags & NFT_SET_EVAL) return true; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index b8d3c3213efe..be7c16c79f71 100644 --- 
a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -994,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); - NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); @@ -1113,6 +1114,25 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, } /** + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. + */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + +/** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation @@ -1170,7 +1190,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 0 : m->bsize_max); - /* Starting map doesn't need to be set for this implementation */ + pipapo_resmap_init_avx2(m, res); nft_pipapo_avx2_prepare(); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index f735d79d8be5..3e3ae29dde33 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -333,13 +333,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, - [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { - struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; @@ -617,11 +617,11 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { - opt = (struct geneve_opt *)opts->u.data + offset; + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; + opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, @@ -630,8 +630,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; diff --git a/net/netlabel/netlabel_kapi.c 
b/net/netlabel/netlabel_kapi.c index 27511c90a26f..daef0eeaea2c 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1140,6 +1140,11 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + if (sk->sk_family != AF_INET6) { + ret_val = -EAFNOSUPPORT; + goto conn_setattr_return; + } + addr6 = (struct sockaddr_in6 *)addr; entry = netlbl_domhsh_getentry_af6(secattr->domain, &addr6->sin6_addr); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9445ca97163b..10c646b32b9d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -931,12 +931,6 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, pskb_trim(skb, ovs_mac_header_len(key)); } - /* Need to set the pkt_type to involve the routing layer. The - * packet movement through the OVS datapath doesn't generally - * use routing, but this is needed for tunnel cases. - */ - skb->pkt_type = PACKET_OUTGOING; - if (likely(!mru || (skb->len <= mru + vport->dev->hard_header_len))) { ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); @@ -965,8 +959,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, upcall.cmd = OVS_PACKET_CMD_ACTION; upcall.mru = OVS_CB(skb)->mru; - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { + nla_for_each_nested(a, attr, rem) { switch (nla_type(a)) { case OVS_USERSPACE_ATTR_USERDATA: upcall.userdata = a; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8a848ce72e29..b80bd3a90773 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -788,7 +788,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - u8 label_count = 1; + size_t label_count = 1; memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 9c13e14034d3..089ab1826e1d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2862,7 +2862,8 @@ static int validate_set(const struct nlattr *a, size_t key_len; /* There can be only one key in a action */ - if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + if (!nla_ok(ovs_key, nla_len(a)) || + nla_total_size(nla_len(ovs_key)) != nla_len(a)) return -EINVAL; key_len = nla_len(ovs_key); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b0455fda7d0b..ac87fcff4795 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -54,8 +54,8 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 0c8aa7e686ea..99fb869aee91 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -68,7 +68,7 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy diff --git a/net/sched/cls_api.c 
b/net/sched/cls_api.c index 96c39e9a873c..7245f39d1e65 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1977,6 +1977,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); + int ret = -EMSGSIZE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -2021,11 +2022,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, return skb->len; +cls_op_not_supp: + ret = -EOPNOTSUPP; out_nlmsg_trim: nla_put_failure: -cls_op_not_supp: nlmsg_trim(skb, b); - return -1; + return ret; +} + +static struct sk_buff *tfilter_notify_prep(struct net *net, + struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto *tp, + struct tcf_block *block, + struct Qdisc *q, u32 parent, + void *fh, int event, + u32 portid, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE; + struct sk_buff *skb; + int ret; + +retry: + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOBUFS); + + ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event, false, + rtnl_held, extack); + if (ret <= 0) { + kfree_skb(skb); + if (ret == -EMSGSIZE) { + size += NLMSG_GOODSIZE; + goto retry; + } + return ERR_PTR(-EINVAL); + } + return skb; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, @@ -2038,16 +2073,13 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err = 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; + if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return 0; - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held, extack) <= 0) { - kfree_skb(skb); - return -EINVAL; - } + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event, + portid, rtnl_held, extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); if (unicast) err = rtnl_unicast(skb, net, portid); @@ -2067,16 +2099,14 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; int err; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return tp->ops->delete(tp, fh, last, rtnl_held, extack); - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held, extack) <= 0) { + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, portid, rtnl_held, extack); + if (IS_ERR(skb)) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); - kfree_skb(skb); - return -EINVAL; + return PTR_ERR(skb); } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); @@ -2891,6 +2921,9 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, struct sk_buff *skb; int err = 0; + if (!unicast && !rtnl_notify_needed(net, flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -2920,6 +2953,9 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, struct net *net = block->net; struct sk_buff *skb; + if (!rtnl_notify_needed(net, flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 382c7a71f81f..db7151c6b70b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -104,8 +104,8 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (prog->exts_integrated) { res->class = 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index bcf1b8012b2c..b00e491e8130 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -748,7 +748,7 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index d7a4874543de..63c02040b426 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -95,10 +95,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) &q->stats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. 
- */ - if (q->stats.drop_count && sch->q.qlen) { + if (q->stats.drop_count) { qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; q->stats.drop_len = 0; @@ -171,7 +168,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 19901e77cd3b..9b36955e32b1 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,6 +35,11 @@ struct drr_sched { struct Qdisc_class_hash clhash; }; +static bool cl_is_active(struct drr_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); @@ -105,6 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); + INIT_LIST_HEAD(&cl->alist); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -229,7 +235,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - list_del(&cl->alist); + list_del_init(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, @@ -336,7 +342,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; - bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { @@ -346,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -356,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first) { + if (!cl_is_active(cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } @@ -390,7 +394,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); @@ -431,7 +435,7 @@ static void drr_reset_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); qdisc_reset(cl->qdisc); } } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 9fd70462b41d..3ee46f6e005d 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -74,6 +74,11 @@ static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, }; +static bool cl_is_active(struct ets_class *cl) +{ + return !list_empty(&cl->alist); +} + static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, unsigned int *quantum, struct netlink_ext_ack *extack) @@ -293,7 +298,7 @@ static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) * to remove them. 
*/ if (!ets_class_is_strict(q, cl) && sch->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); } static int ets_class_dump(struct Qdisc *sch, unsigned long arg, @@ -416,7 +421,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct ets_sched *q = qdisc_priv(sch); struct ets_class *cl; int err = 0; - bool first; cl = ets_classify(skb, sch, &err); if (!cl) { @@ -426,7 +430,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -436,7 +439,7 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first && !ets_class_is_strict(q, cl)) { + if (!cl_is_active(cl) && !ets_class_is_strict(q, cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } @@ -488,7 +491,7 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); return ets_qdisc_dequeue_skb(sch, skb); } @@ -657,8 +660,8 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, } for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) - list_del(&q->classes[i].alist); - qdisc_tree_flush_backlog(q->classes[i].qdisc); + list_del_init(&q->classes[i].alist); + qdisc_purge_queue(q->classes[i].qdisc); } q->nstrict = nstrict; memcpy(q->prio2band, priomap, sizeof(priomap)); @@ -713,7 +716,7 @@ static void ets_qdisc_reset(struct Qdisc *sch) for (band = q->nstrict; band < q->nbands; band++) { if (q->classes[band].qdisc->q.qlen) - list_del(&q->classes[band].alist); + list_del_init(&q->classes[band].alist); } for (band = 0; band < q->nbands; band++) qdisc_reset(q->classes[band].qdisc); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f59a2cb2c803..91f5ef6be0f2 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -901,7 +901,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); } while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 8c4fee063436..47b5a056165c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -314,10 +314,8 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. 
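codel, fq, fq_codel, fq_pie, hhf and pie now share one shape for the "new limit is lower" path via qdisc_dequeue_internal(). A sketch of that loop as used in these hunks; the boolean argument is assumed, from these call sites, to select a raw head dequeue (true) over the qdisc's own ->dequeue() (false):

unsigned int qlen = sch->q.qlen;
unsigned int prev_backlog = sch->qstats.backlog;
unsigned int dropped = 0;

/* Drop excess packets if the new limit is lower. */
while (sch->q.qlen > sch->limit) {
	struct sk_buff *skb = qdisc_dequeue_internal(sch, false);

	if (!skb)
		break;
	dropped += qdisc_pkt_len(skb);
	rtnl_kfree_skbs(skb, skb);
}

/* Tell the ancestors how much queue and backlog went away. */
qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
			  prev_backlog - sch->qstats.backlog);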
- */ - if (q->cstats.drop_count && sch->q.qlen) { + + if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; @@ -433,7 +431,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { - struct sk_buff *skb = fq_codel_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); q->cstats.drop_len += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 68e6acd0f130..607c580d75e4 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -357,7 +357,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); len_dropped += qdisc_pkt_len(skb); num_dropped += 1; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 880c5f16b29c..afcb83d469ff 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -175,6 +175,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -203,7 +208,10 @@ eltree_insert(struct hfsc_class *cl) static inline void eltree_remove(struct hfsc_class *cl) { - rb_erase(&cl->el_node, &cl->sched->eligible); + if (!RB_EMPTY_NODE(&cl->el_node)) { + rb_erase(&cl->el_node, &cl->sched->eligible); + RB_CLEAR_NODE(&cl->el_node); + } } static inline void @@ -958,6 +966,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { int old_flags; + int len = 0; if (parentid) { if (cl->cl_parent && @@ -988,9 +997,13 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (usc != NULL) hfsc_change_usc(cl, usc, cur_time); + if (cl->qdisc->q.qlen != 0) + len = qdisc_peek_len(cl->qdisc); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ if (cl->qdisc->q.qlen != 0) { - int len = qdisc_peek_len(cl->qdisc); - if (cl->cl_flags & HFSC_RSC) { if (old_flags & HFSC_RSC) update_ed(cl, len); @@ -1032,6 +1045,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1219,7 +1234,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. 
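The hfsc hunks above guard eltree_remove() with RB_EMPTY_NODE() and clear nodes with RB_CLEAR_NODE() both at class init and after each erase. The reason, sketched with the kernel rbtree API: rb_erase() leaves the removed node's parent pointer stale, so membership cannot be tested afterwards unless the node is explicitly cleared:

struct entity {
	struct rb_node node;
};

static void entity_init(struct entity *e)
{
	RB_CLEAR_NODE(&e->node);	/* parent pointer := self */
}

static void entity_remove(struct entity *e, struct rb_root *root)
{
	if (!RB_EMPTY_NODE(&e->node)) {	/* only if actually inserted */
		rb_erase(&e->node, root);
		RB_CLEAR_NODE(&e->node);
	}
}

With both halves in place, a class can be taken out of the eligible tree by qlen_notify and again by the regular dequeue path without hitting a double rb_erase().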
*/ - update_vf(cl, 0, 0); + if (cl->cl_nactive) + update_vf(cl, 0, 0); if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); } @@ -1561,7 +1577,10 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } - if (first) { + sch->qstats.backlog += len; + sch->q.qlen++; + + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) @@ -1576,9 +1595,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } - sch->qstats.backlog += len; - sch->q.qlen++; - return NET_XMIT_SUCCESS; } @@ -1633,10 +1649,16 @@ hfsc_dequeue(struct Qdisc *sch) if (cl->qdisc->q.qlen != 0) { /* update ed */ next_len = qdisc_peek_len(cl->qdisc); - if (realtime) - update_ed(cl, next_len); - else - update_d(cl, next_len); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ + if (cl->qdisc->q.qlen != 0) { + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } } else { /* the class becomes passive */ eltree_remove(cl); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d26cd436cbe3..83fc44f20e31 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -560,7 +560,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = hhf_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); rtnl_kfree_skbs(skb, skb); } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 19035ef8387f..716da8c6b3de 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -348,7 +348,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q, */ static inline void htb_next_rb_node(struct rb_node **n) { - *n = rb_next(*n); + if (*n) + *n = rb_next(*n); } /** @@ -609,8 +610,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) */ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(!cl->prio_activity); - + if (!cl->prio_activity) + return; htb_deactivate_prios(q, cl); cl->prio_activity = 0; } @@ -1738,8 +1739,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->parent) cl->parent->children--; - if (cl->prio_activity) - htb_deactivate(q, cl); + htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&cl->pq_node, @@ -1947,8 +1947,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); parent_qdisc = parent->leaf.q; - if (parent->prio_activity) - htb_deactivate(q, parent); + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 2da6250ec346..48c5ab8ec143 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -190,7 +190,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index fdc5ef52c3ee..fdd9caa41e80 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -211,7 +211,7 @@ static int prio_tune(struct Qdisc 
*sch, struct nlattr *opt, memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) - qdisc_tree_flush_backlog(q->queues[i]); + qdisc_purge_queue(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 546c10adcacd..5e557b960bde 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -202,6 +202,11 @@ struct qfq_sched { */ enum update_reason {enqueue, requeue}; +static bool cl_is_active(struct qfq_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -347,7 +352,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) struct qfq_aggregate *agg = cl->agg; - list_del(&cl->alist); /* remove from RR queue of the aggregate */ + list_del_init(&cl->alist); /* remove from RR queue of the aggregate */ if (list_empty(&agg->active)) /* agg is now inactive */ qfq_deactivate_agg(q, agg); } @@ -477,6 +482,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; + INIT_LIST_HEAD(&cl->alist); cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); @@ -985,7 +991,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, cl->deficit -= (int) len; if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ - list_del(&cl->alist); + list_del_init(&cl->alist); else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); @@ -1217,7 +1223,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; - bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -1239,7 +1244,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1255,8 +1259,8 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, ++sch->q.qlen; agg = cl->agg; - /* if the queue was not empty, then done here */ - if (!first) { + /* if the class is active, then done here */ + if (cl_is_active(cl)) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) @@ -1418,6 +1422,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; + if (list_empty(&cl->alist)) + return; qfq_deactivate_class(q, cl); } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 16277b6a0238..3c6b4460cf2c 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -283,7 +283,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, q->userbits = userbits; q->limit = ctl->limit; if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old_child = q->qdisc; q->qdisc = child; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 60754f366ab7..d564675a8be4 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -310,7 +310,10 @@ drop: /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
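The sch_sfq drop path just below walks a circular, singly linked list of active slots, and previously left q->tail pointing at a freed slot when the last one was removed. A standalone sketch of the corner case, with illustrative index-based types (the kernel uses slot pointers):

#include <assert.h>

#define EMPTY 0xffff

struct slots { unsigned short next[4]; };

/* Remove the slot after *tail from a circular singly linked list.
 * When that slot links to itself it was the only one left, so the
 * list must become empty instead of leaving *tail dangling.
 */
static void remove_after_tail(struct slots *s, unsigned short *tail)
{
	unsigned short x = s->next[*tail];

	if (s->next[x] == x)
		*tail = EMPTY;			/* no more active slots */
	else
		s->next[*tail] = s->next[x];
}

int main(void)
{
	struct slots s;
	unsigned short tail = 0;

	s.next[0] = 0;		/* single slot pointing at itself */
	remove_after_tail(&s, &tail);
	assert(tail == EMPTY);	/* list is empty, not dangling */
	return 0;
}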
*/ x = q->tail->next; slot = &q->slots[x]; - q->tail->next = slot->next; + if (slot->next == x) + q->tail = NULL; /* no more active slots */ + else + q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } @@ -631,6 +634,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; + unsigned int maxflows; + unsigned int quantum; + unsigned int divisor; + int perturb_period; + u8 headdrop; + u8 maxdepth; + int limit; + u8 flags; + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; @@ -652,40 +664,65 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, if (!p) return -ENOMEM; } - if (ctl->limit == 1) { - NL_SET_ERR_MSG_MOD(extack, "invalid limit"); - return -EINVAL; - } + sch_tree_lock(sch); + + limit = q->limit; + divisor = q->divisor; + headdrop = q->headdrop; + maxdepth = q->maxdepth; + maxflows = q->maxflows; + perturb_period = q->perturb_period; + quantum = q->quantum; + flags = q->flags; + + /* update and validate configuration */ if (ctl->quantum) - q->quantum = ctl->quantum; - WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); + quantum = ctl->quantum; + perturb_period = ctl->perturb_period * HZ; if (ctl->flows) - q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { - q->divisor = ctl->divisor; - q->maxflows = min_t(u32, q->maxflows, q->divisor); + divisor = ctl->divisor; + maxflows = min_t(u32, maxflows, divisor); } if (ctl_v1) { if (ctl_v1->depth) - q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { - swap(q->red_parms, p); - red_set_parms(q->red_parms, + red_set_parms(p, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } - q->flags = ctl_v1->flags; - q->headdrop = ctl_v1->headdrop; + flags = ctl_v1->flags; + headdrop = ctl_v1->headdrop; } if (ctl->limit) { - q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); - q->maxflows = min_t(u32, q->maxflows, q->limit); + limit = min_t(u32, ctl->limit, maxdepth * maxflows); + maxflows = min_t(u32, maxflows, limit); + } + if (limit == 1) { + sch_tree_unlock(sch); + kfree(p); + NL_SET_ERR_MSG_MOD(extack, "invalid limit"); + return -EINVAL; } + /* commit configuration */ + q->limit = limit; + q->divisor = divisor; + q->headdrop = headdrop; + q->maxdepth = maxdepth; + q->maxflows = maxflows; + WRITE_ONCE(q->perturb_period, perturb_period); + q->quantum = quantum; + q->flags = flags; + if (p) + swap(q->red_parms, p); + qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { dropped += sfq_drop(sch, &to_free); diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 5df2dacb7b1a..05aa363a7fee 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -121,8 +121,6 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check to update highest and lowest priorities. */ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { - /* The incoming packet is the only packet in queue. */ - BUG_ON(sch->q.qlen != 1); q->lowest_prio = prio; q->highest_prio = prio; } else { @@ -154,7 +152,6 @@ static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) /* Update highest priority field. 
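The sfq_change() rework above follows a stage/validate/commit shape: copy the live configuration into locals, fold in the user's updates, reject invalid combinations, and only then publish, so an -EINVAL can no longer leave the qdisc half-reconfigured. A compact sketch of the pattern with illustrative types (the locking done by sch_tree_lock() in the real code is elided):

#include <errno.h>

struct user_ctl { unsigned int limit; };

struct sched_cfg { unsigned int limit, maxdepth, maxflows; };

static int reconfig(struct sched_cfg *q, const struct user_ctl *ctl)
{
	unsigned int limit = q->limit;		/* stage current values */
	unsigned int maxflows = q->maxflows;

	if (ctl->limit) {
		limit = ctl->limit;
		if (limit > q->maxdepth * maxflows)
			limit = q->maxdepth * maxflows;
		if (maxflows > limit)
			maxflows = limit;
	}
	if (limit == 1)
		return -EINVAL;			/* *q untouched on error */

	q->limit = limit;			/* commit as one block */
	q->maxflows = maxflows;
	return 0;
}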
*/ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { - BUG_ON(sch->q.qlen); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index f92174008499..61dd5c8f2310 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -452,7 +452,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 108a0745c0c3..b84c5e0a76f5 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -71,8 +71,9 @@ /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); static void sctp_wfree(struct sk_buff *skb); -static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len); +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, + struct sctp_transport *transport, + long *timeo_p, size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1827,7 +1828,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + err = sctp_wait_for_sndbuf(asoc, transport, &timeo, msg_len); if (err) goto err; if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { @@ -9208,8 +9209,9 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ -static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len) +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, + struct sctp_transport *transport, + long *timeo_p, size_t msg_len) { struct sock *sk = asoc->base.sk; long current_timeo = *timeo_p; @@ -9219,7 +9221,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); - /* Increment the association's refcnt. */ + /* Increment the transport and association's refcnt. */ + if (transport) + sctp_transport_hold(transport); sctp_association_hold(asoc); /* Wait on the association specific sndbuf space. */ @@ -9228,7 +9232,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, TASK_INTERRUPTIBLE); if (asoc->base.dead) goto do_dead; - if (!*timeo_p) + if ((!*timeo_p) || (transport && transport->dead)) goto do_nonblock; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; @@ -9253,7 +9257,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, out: finish_wait(&asoc->wait, &wait); - /* Release the association's refcnt. */ + /* Release the transport and association's refcnt. 
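The sctp_wait_for_sndbuf() change above pins the chosen transport for the duration of the sleep and bails out once sctp_transport_free() marks it dead. The shape of the wait, sketched; sndbuf_space() stands in for the real wspace and memory-scheduling checks and is not a kernel function:

if (transport)
	sctp_transport_hold(transport);	/* keep it alive while asleep */
sctp_association_hold(asoc);

for (;;) {
	prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE);
	if (asoc->base.dead)
		break;				/* association torn down */
	if (!*timeo_p || (transport && transport->dead))
		break;				/* -EAGAIN */
	if (sndbuf_space(asoc, msg_len))
		break;				/* 0: proceed to send */
	release_sock(sk);
	*timeo_p = schedule_timeout(*timeo_p);
	lock_sock(sk);
}
finish_wait(&asoc->wait, &wait);

if (transport)
	sctp_transport_put(transport);	/* drop the pin on every path */
sctp_association_put(asoc);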
*/ + if (transport) + sctp_transport_put(transport); sctp_association_put(asoc); return err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index fd73be940f46..77a76634014a 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -529,6 +529,8 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write, return ret; } +static DEFINE_MUTEX(sctp_sysctl_mutex); + static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -553,6 +555,7 @@ static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, if (new_value > max || new_value < min) return -EINVAL; + mutex_lock(&sctp_sysctl_mutex); net->sctp.udp_port = new_value; sctp_udp_sock_stop(net); if (new_value) { @@ -565,6 +568,7 @@ static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, lock_sock(sk); sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); release_sock(sk); + mutex_unlock(&sctp_sysctl_mutex); } return ret; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 2abe45af98e7..31eca29b6cfb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -117,6 +117,8 @@ fail: */ void sctp_transport_free(struct sctp_transport *transport) { + transport->dead = 1; + /* Try to delete the heartbeat timer. */ if (del_timer(&transport->hb_timer)) sctp_transport_put(transport); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index dbcc72b43d0c..d44d7f427fc9 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -1084,14 +1084,16 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *base_ndev; struct net *net; - ndev = pnet_find_base_ndev(ndev); + base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); - if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, ini); + smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 142ee6554848..4ffb2bcaf364 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -275,9 +275,6 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); - if (!xprt_bound(xprt)) - clnt->cl_autobind = 1; - clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 102c3818bc54..53bcca365fb1 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -820,9 +820,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) } trace_rpcb_setport(child, map->r_status, map->r_port); - xprt->ops->set_port(xprt, map->r_port); - if (map->r_port) + if (map->r_port) { + xprt->ops->set_port(xprt, map->r_port); xprt_set_bound(xprt); + } } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9b45fbdc90ca..73bc39281ef5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,6 +276,8 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(current->flags & PF_EXITING)) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return 
-ERESTARTSYS; diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index c524421ec652..79f91b6ca8c8 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -817,12 +817,20 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, goto exit; } + /* Get net to avoid freed tipc_crypto when delete namespace */ + if (!maybe_get_net(aead->crypto->net)) { + tipc_bearer_put(b); + rc = -ENODEV; + goto exit; + } + /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); + put_net(aead->crypto->net); exit: kfree(ctx); @@ -860,6 +868,7 @@ static void tipc_aead_encrypt_done(void *data, int err) kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); + put_net(net); } /** diff --git a/net/tipc/link.c b/net/tipc/link.c index d0143823658d..6c6d8546c578 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1068,6 +1068,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { pr_warn("%s<%s>, link overflow", link_rst_msg, l->name); + __skb_queue_purge(list); return -ENOBUFS; } rc = link_schedule_user(l, hdr); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 77a3d016cade..ddc3e4e5e18d 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -716,7 +716,8 @@ void tipc_mon_reinit_self(struct net *net) if (!mon) continue; write_lock_bh(&mon->lock); - mon->self->addr = tipc_own_addr(net); + if (mon->self) + mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 0a67b93a52ec..d7dea82bcf56 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -804,6 +804,11 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } +static int tls_disconnect(struct sock *sk, int flags) +{ + return -EOPNOTSUPP; +} + struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -899,6 +904,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_BASE] = *base; prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect; prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 5df08d848b5c..1852fac3e72b 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -395,7 +395,6 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) return 0; shinfo = skb_shinfo(strp->anchor); - shinfo->frag_list = NULL; /* If we don't know the length go max plus page for cipher overhead */ need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; @@ -411,6 +410,8 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) page, 0, 0); } + shinfo->frag_list = NULL; + strp->copy_mode = 1; strp->stm.offset = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6e30fe879d53..4a9a3aed5d6d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -908,6 +908,13 @@ more_data: &msg_redir, send, flags); lock_sock(sk); if (err < 0) { + /* Regardless of whether the data represented by + * msg_redir is sent successfully, we have already + * uncharged it via sk_msg_return_zero(). 
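The tipc_aead_encrypt() hunk above pins the network namespace before firing an asynchronous crypto request, so the completion callback cannot race namespace teardown. The pattern in isolation (the matching put in tipc_aead_encrypt_done() is part of this diff):

/* Pin the netns across an async operation; bail out if it is
 * already being dismantled.
 */
if (!maybe_get_net(net)) {
	rc = -ENODEV;
	goto exit;
}

rc = crypto_aead_encrypt(req);
if (rc == -EINPROGRESS || rc == -EBUSY)
	return rc;	/* async: completion handler does put_net() */

put_net(net);		/* synchronous finish: drop the ref now */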
The + * msg->sg.size represents the remaining unprocessed + * data, which needs to be uncharged here. + */ + sk_mem_uncharge(sk, msg->sg.size); *copied -= sk_msg_free_nocharge(sk, &msg_redir); msg->sg.size = 0; } @@ -1120,9 +1127,13 @@ alloc_encrypted: num_async++; else if (ret == -ENOMEM) goto wait_for_memory; - else if (ctx->open_rec && ret == -ENOSPC) + else if (ctx->open_rec && ret == -ENOSPC) { + if (msg_pl->cork_bytes) { + ret = 0; + goto send_end; + } goto rollback_iter; - else if (ret != -EAGAIN) + } else if (ret != -EAGAIN) goto send_end; } continue; diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 28b232f281ab..8b5d04210d7c 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -16,11 +16,6 @@ config UNIX Say Y unless you know what you are doing. -config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4d0..4ddd125c4642 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ab23c8d72122..236a2cd2bc93 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,8 +117,6 @@ #include <linux/file.h> #include <linux/btf_ids.h> -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -980,11 +978,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); - u->inflight = 0; + u->listener = NULL; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1583,6 +1581,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1678,8 +1677,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1709,6 +1708,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); @@ -1752,51 +1752,65 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. 
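The remaining af_unix hunks all deal with fds "in flight": file descriptors sitting inside SCM_RIGHTS messages that were sent but never received. For readers without that context, a minimal userspace program that manufactures exactly this state (illustrative only; error handling omitted):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int carrier[2], payload[2];
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	socketpair(AF_UNIX, SOCK_DGRAM, 0, carrier);
	socketpair(AF_UNIX, SOCK_DGRAM, 0, payload);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &payload[0], sizeof(int));

	sendmsg(carrier[0], &msg, 0);	/* payload[0] is now inflight */
	close(payload[0]);	/* our ref gone; the queued ref remains */
	return 0;		/* only the kernel GC can reap cycles */
}

Until the message is received or collected, the passed socket is kept alive solely by the in-queue reference that the new edge/vertex code below tracks.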
+ */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + unix_destroy_fpl(scm->fp); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); +} - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? 
fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1855,8 +1869,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1864,8 +1880,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* @@ -1885,11 +1903,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2157,11 +2176,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2a758531e102..23efb78fe9ef 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,278 +81,551 @@ #include <net/scm.h> #include <net/tcp_states.h> -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; -/* Internal data structures and random procedures: */ + ops = READ_ONCE(sock->ops); -static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); + /* PF_UNIX ? */ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); + } -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) + return NULL; +} + +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); - - if (sk) { - struct unix_sock *u = unix_sk(sk); - - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. 
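The rewritten garbage.c below builds an explicit graph of inflight sockets. The unix_vertex and unix_edge definitions live in a header outside this diff; as a reading aid, here is a reconstruction inferred from their uses in this patch (field set assumed, not authoritative):

struct unix_vertex {
	struct list_head edges;		/* outgoing unix_edge list */
	struct list_head entry;		/* unvisited/visited global lists */
	struct list_head scc_entry;	/* DFS stack, then finalised SCC */
	unsigned long out_degree;	/* fds of this socket inflight */
	unsigned long index;		/* DFS discovery index */
	unsigned long scc_index;	/* lowlink-style SCC identifier */
};

struct unix_edge {
	struct unix_sock *predecessor;	/* socket whose fd is inflight */
	struct unix_sock *successor;	/* receiver queue holding the fd */
	struct list_head vertex_entry;	/* on the predecessor's edge list */
	struct list_head stack_entry;	/* DFS edge stack for backtracking */
};

One vertex stands for one inflight socket; one edge records "this socket's fd sits in that receiver's queue", which is exactly the reference the collector has to reason about.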
+ */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + +static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. + */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; +} + +static LIST_HEAD(unix_unvisited_vertices); + +enum unix_vertex_index { + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, + UNIX_VERTEX_INDEX_START, +}; + +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; } - spin_unlock(&x->sk_receive_queue.lock); + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); } -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); + struct unix_vertex *vertex = edge->predecessor->vertex; - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); + if (!fpl->dead) + unix_update_graph(unix_edge_successor(edge)); - /* An embryo cannot be in-flight, so it's safe - * to use the list link. 
- */ - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); + list_del(&edge->vertex_entry); + vertex->out_degree--; - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); } } -static void dec_inflight(struct unix_sock *usk) +static void unix_free_vertices(struct scm_fp_list *fpl) { - usk->inflight--; + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } } -static void inc_inflight(struct unix_sock *usk) +static DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { - usk->inflight++; + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + + receiver->scm_stat.nr_unix_fds += fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); } -static void inc_inflight_move_tail(struct unix_sock *u) +void unix_del_edges(struct scm_fp_list *fpl) { - u->inflight++; + struct unix_sock *receiver; + int i = 0; + + spin_lock(&unix_gc_lock); - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + + if (!fpl->dead) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} + +void unix_update_edges(struct unix_sock *receiver) +{ + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. 
*/ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } } -static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 +int unix_prepare_fpl(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex; + int i; + + if (!fpl->count_unix) + return 0; + + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; + + list_add(&vertex->entry, &fpl->vertices); + } + + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; + + return 0; -void wait_for_unix_gc(void) +err: + unix_free_vertices(fpl); + return -ENOMEM; +} + +void unix_destroy_fpl(struct scm_fp_list *fpl) { - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); + if (fpl->inflight) + unix_del_edges(fpl); + + kvfree(fpl->edges); + unix_free_vertices(fpl); } -/* The external entry point: unix_gc() */ -void unix_gc(void) +static bool unix_vertex_dead(struct unix_vertex *vertex) { - struct sk_buff *next_skb, *skb; + struct unix_edge *edge; struct unix_sock *u; - struct unix_sock *next; - struct sk_buff_head hitlist; - struct list_head cursor; - LIST_HEAD(not_cycle_list); + long total_ref; - spin_lock(&unix_gc_lock); + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - * - * Embryos, though never candidates themselves, affect which - * candidates are reachable by the garbage collector. Before - * being added to a listener's queue, an embryo may already - * receive data carrying SCM_RIGHTS, potentially making the - * passed socket a candidate that is not yet reachable by the - * collector. It becomes reachable once the embryo is - * enqueued. Therefore, we must ensure that no SCM-laden - * embryo appears in a (candidate) listener's queue between - * consecutive scan_children() calls. 
- */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - struct sock *sk = &u->sk; - long total_refs; - - total_refs = file_count(sk->sk_socket->file); - - BUG_ON(!u->inflight); - BUG_ON(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - - if (sk->sk_state == TCP_LISTEN) { - unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); - unix_state_unlock(sk); + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; +} + +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; + +static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist) +{ + skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + WARN_ON_ONCE(skb_unref(u->oob_skb)); + u->oob_skb = NULL; + } +#endif +} + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + unix_collect_queue(unix_sk(skb->sk), hitlist); + spin_unlock(&embryo_queue->lock); } + } else { + unix_collect_queue(u, hitlist); } + + spin_unlock(&queue->lock); } +} - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + +static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) +{ + LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). + * The vertex will be popped when finalising SCC later. 
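__unix_walk_scc() below is an iterative Tarjan strongly-connected-components pass that keeps an explicit edge stack instead of recursing. For reference, the same algorithm in its compact recursive form over a toy adjacency matrix (a standalone illustration of the algorithm only, using none of the kernel's structures):

#include <stdio.h>

enum { N = 4 };

static const int adj[N][N] = {	/* adj[u][v] != 0 : edge u -> v */
	{ 0, 1, 0, 0 },
	{ 0, 0, 1, 0 },
	{ 1, 0, 0, 1 },		/* 0 -> 1 -> 2 -> 0 is one SCC */
	{ 0, 0, 0, 0 },		/* 3 is a singleton SCC */
};

static int idx[N], low[N], on_stack[N], stack[N], top, counter = 1;

static void tarjan(int u)
{
	int v;

	idx[u] = low[u] = counter++;	/* discovery index == lowlink */
	stack[top++] = u;
	on_stack[u] = 1;

	for (v = 0; v < N; v++) {
		if (!adj[u][v])
			continue;
		if (!idx[v]) {			/* tree edge: recurse */
			tarjan(v);
			if (low[v] < low[u])
				low[u] = low[v];
		} else if (on_stack[v] && idx[v] < low[u]) {
			low[u] = idx[v];	/* back/cross edge on stack */
		}
	}

	if (idx[u] == low[u]) {			/* u is an SCC root */
		printf("SCC:");
		do {
			v = stack[--top];
			on_stack[v] = 0;
			printf(" %d", v);
		} while (v != u);
		printf("\n");
	}
}

int main(void)
{
	for (int u = 0; u < N; u++)
		if (!idx[u])
			tarjan(u);
	return 0;
}

Vertices 0, 1 and 2 print as one SCC because they form a cycle, just as sockets holding each other's fds would; vertex 3 is a singleton, like an fd that can still be received by a non-inflight socket.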
*/ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); + list_add(&vertex->scc_entry, &vertex_stack); + + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + if (!next_vertex) + continue; + + if (next_vertex->index == unix_vertex_unvisited_index) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); + vertex = next_vertex; + goto next_vertex; - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + next_vertex = vertex; + vertex = edge->predecessor->vertex; + + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index + * to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, + * propagate it to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else { + /* The successor was already grouped as another SCC */ } } - list_del(&cursor); - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); + if (vertex->index == vertex->scc_index) { + struct unix_vertex *v; + struct list_head scc; + bool scc_dead = true; -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; + /* SCC finalised. + * + * If the scc_index was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(v, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&v->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack. */ + v->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(v); } -#endif + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. + /* Need backtracking ? 
*/ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(struct sk_buff_head *hitlist) +{ + unsigned long last_index = UNIX_VERTEX_INDEX_START; + + unix_graph_maybe_cyclic = false; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex, &last_index, hitlist); } - spin_unlock(&unix_gc_lock); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->destructor == io_uring_destruct_scm) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) +{ + unix_graph_maybe_cyclic = false; + + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + bool scc_dead = true; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_move_tail(&vertex->entry, &unix_visited_vertices); + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); } + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* Here we are. Hitlist is filled. Die. */ - __skb_queue_purge(&hitlist); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +} + +static bool gc_in_progress; + +static void __unix_gc(struct work_struct *work) +{ + struct sk_buff_head hitlist; + struct sk_buff *skb; spin_lock(&unix_gc_lock); - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); + goto skip_gc; + } + + __skb_queue_head_init(&hitlist); + + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); - /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); + spin_unlock(&unix_gc_lock); + + skb_queue_walk(&hitlist, skb) { + if (UNIXCB(skb).fp) + UNIXCB(skb).fp->dead = true; + } - /* Paired with READ_ONCE() in wait_for_unix_gc(). 
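The thresholds in wait_for_unix_gc() below compose as follows, assuming SCM_MAX_FD == 253 from include/net/scm.h: one sendmsg() can attach at most 253 fds, so UNIX_INFLIGHT_SANE_USER = SCM_MAX_FD * 8 = 2024. Crossing 16000 total inflight sockets merely queues the async worker; only a user who personally has more than 2024 not-yet-received fds in flight is additionally forced through flush_work() and absorbs the GC latency, which keeps the penalty off well-behaved senders.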
*/ + __skb_queue_purge(&hitlist); +skip_gc: WRITE_ONCE(gc_in_progress, false); +} - wake_up(&unix_gc_wait); +static DECLARE_WORK(unix_gc_work, __unix_gc); - out: - spin_unlock(&unix_gc_lock); +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +void wait_for_unix_gc(struct scm_fp_list *fpl) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). + */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); } diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index e92f2fad6410..000000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/fs.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/init.h> -#include <linux/io_uring.h> - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct sock *unix_get_socket(struct file *filp) -{ - struct sock *u_sock = NULL; - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - const struct proto_ops *ops = READ_ONCE(sock->ops); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && ops && ops->family == PF_UNIX) - u_sock = s; - } - - return u_sock; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - if (!u->inflight) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - u->inflight++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!u->inflight); - BUG_ON(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. 
If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); - -void io_uring_destruct_scm(struct sk_buff *skb) -{ - unix_destruct_scm(skb); -} -EXPORT_SYMBOL(io_uring_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f16..000000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 622875a6f787..f8f1a49689da 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1485,7 +1485,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { + /* If the socket is already closing or it is in an error state, there + * is no point in waiting. 
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ce622a287abc..6db8c9a2a7a2 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2511,7 +2511,7 @@ cfg80211_defrag_mle(const struct element *mle, const u8 *ie, size_t ielen,
 	/* Required length for first defragmentation */
 	buf_len = mle->datalen - 1;
 	for_each_element(elem, mle->data + mle->datalen,
-			 ielen - sizeof(*mle) + mle->datalen) {
+			 ie + ielen - mle->data - mle->datalen) {
 		if (elem->id != WLAN_EID_FRAGMENT)
 			break;
 
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index f38388b6b62c..e83b707da25b 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -104,7 +104,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 		if (pool->unaligned)
 			pool->free_heads[i] = xskb;
 		else
-			xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
+			xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size);
 	}
 
 	return pool;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 04dc0c8a8370..7188d3592dde 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -371,7 +371,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
 
 	xdo->dev = dev;
 	netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC);
-	xdo->real_dev = dev;
 	xdo->type = XFRM_DEV_OFFLOAD_PACKET;
 	switch (dir) {
 	case XFRM_POLICY_IN:
@@ -393,7 +392,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
 	err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack);
 	if (err) {
 		xdo->dev = NULL;
-		xdo->real_dev = NULL;
 		xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
 		xdo->dir = 0;
 		netdev_put(dev, &xdo->dev_tracker);
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index e5722c95b8bb..a30538a980cc 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -610,6 +610,40 @@ out:
 }
 EXPORT_SYMBOL_GPL(xfrm_output_resume);
 
+static int xfrm_dev_direct_output(struct sock *sk, struct xfrm_state *x,
+				  struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct net *net = xs_net(x);
+	int err;
+
+	dst = skb_dst_pop(skb);
+	if (!dst) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+		kfree_skb(skb);
+		return -EHOSTUNREACH;
+	}
+	skb_dst_set(skb, dst);
+	nf_reset_ct(skb);
+
+	err = skb_dst(skb)->ops->local_out(net, sk, skb);
+	if (unlikely(err != 1)) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	/* In transport mode, network destination is
+	 * directly reachable, while in tunnel mode,
+	 * inner packet network may not be. In packet
+	 * offload type, HW is responsible for hard
+	 * header packet mangling so directly xmit skb
+	 * to netdevice.
+	 */
+	skb->dev = x->xso.dev;
+	__skb_push(skb, skb->dev->hard_header_len);
+	return dev_queue_xmit(skb);
+}
+
 static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	return xfrm_output_resume(sk, skb, 1);
@@ -729,6 +763,13 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
 			return -EHOSTUNREACH;
 		}
 
+		/* Exclusive direct xmit for tunnel mode, as
+		 * some filtering or matching rules may apply
+		 * in transport mode.
+		 */
+		if (x->props.mode == XFRM_MODE_TUNNEL)
+			return xfrm_dev_direct_output(sk, x, skb);
+
 		return xfrm_output_resume(sk, skb, 0);
 	}
 
@@ -752,7 +793,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
 	skb->encapsulation = 1;
 
 	if (skb_is_gso(skb)) {
-		if (skb->inner_protocol)
+		if (skb->inner_protocol && x->props.mode == XFRM_MODE_TUNNEL)
 			return xfrm_output_gso(net, sk, skb);
 
 		skb_shinfo(skb)->gso_type |= SKB_GSO_ESP;
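Two of the hunks above are length-arithmetic fixes. cfg80211_defrag_mle() used ielen - sizeof(*mle) + mle->datalen, which only measures the remaining buffer correctly when the multi-link element sits at its start; ie + ielen - mle->data - mle->datalen computes the bytes left after the element wherever it actually sits. In xp_create_and_assign_umem(), i * pool->chunk_size multiplies two 32-bit values, so the product wraps before it is widened to the 64-bit address once the UMEM crosses 4 GiB; casting one operand to u64 first performs the multiply in 64 bits. A minimal standalone demonstration of that wrap, with invented values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t i = 3u << 20;		/* ~3.1M chunks */
	uint32_t chunk_size = 2048;	/* 2 KiB each: more than 4 GiB of UMEM */

	uint64_t wrapped = i * chunk_size;		/* 32-bit product wraps */
	uint64_t correct = (uint64_t)i * chunk_size;	/* widened first, as in the fix */

	printf("wrapped: %llu\n", (unsigned long long)wrapped);	/* 2147483648 */
	printf("correct: %llu\n", (unsigned long long)correct);	/* 6442450944 */
	return 0;
}

Note that the cast must land on an operand of the multiplication; casting the finished 32-bit product would widen it only after the wrap has already happened.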
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 68b3f9e7edff..2edb0f868c57 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1603,6 +1603,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct xfrm_policy *delpol;
 	struct hlist_head *chain;
 
+	/* Sanitize mark before store */
+	policy->mark.v &= policy->mark.m;
+
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
 	if (chain)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8a6e8656d014..d2bd5bddfb05 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -754,9 +754,6 @@ int __xfrm_state_delete(struct xfrm_state *x)
 		net->xfrm.state_num--;
 	spin_unlock(&net->xfrm.xfrm_state_lock);
 
-	if (x->encap_sk)
-		sock_put(rcu_dereference_raw(x->encap_sk));
-
 	xfrm_dev_state_delete(x);
 
 	/* All xfrm_state objects are created by xfrm_state_alloc.
@@ -1329,7 +1326,6 @@ found:
 			xso->type = XFRM_DEV_OFFLOAD_PACKET;
 			xso->dir = xdo->dir;
 			xso->dev = xdo->dev;
-			xso->real_dev = xdo->real_dev;
 			xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ;
 			netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC);
 			error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL);
@@ -1337,7 +1333,6 @@ found:
 				xso->dir = 0;
 				netdev_put(xso->dev, &xso->dev_tracker);
 				xso->dev = NULL;
-				xso->real_dev = NULL;
 				xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
 				x->km.state = XFRM_STATE_DEAD;
 				to_put = x;
@@ -1478,6 +1473,9 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 
 	list_add(&x->km.all, &net->xfrm.state_all);
 
+	/* Sanitize mark before store */
+	x->mark.v &= x->mark.m;
+
 	h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
 			  x->props.reqid, x->props.family);
 	XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h,